tabula-extractor 0.6.3-java → 0.6.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake'
6
6
  Bundler::GemHelper.install_tasks
7
7
 
8
8
  task :test do
9
- ruby %{-J-Xmx512m test/tests.rb}
9
+ ruby %{--debug -X-C -J-Xmx512m test/tests.rb}
10
10
  end
11
11
 
12
12
  task :default => [:test]
data/bin/tabula CHANGED
@@ -34,11 +34,13 @@ EOS
34
34
 
35
35
  opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
36
36
  opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
37
+ opt :password, 'Password to decrypt document. Default is empty', :default => ''
37
38
  opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
39
+ opt :debug, 'Print detected table areas instead of processing.'
38
40
  opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
39
41
  opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
40
42
  end
41
-
43
+
42
44
  if !opts[:area].nil?
43
45
  unless opts[:area].split(',').size == 4 \
44
46
  && opts[:area].split(',').all? { |x| x.strip =~ /(\d+\.?\d*)/ }
@@ -59,25 +61,29 @@ def main
59
61
 
60
62
  area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
61
63
  out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
62
- extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
64
+ extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
63
65
  extractor.extract.each_with_index do |page, page_index|
64
- if opts[:guess]
66
+ if opts[:guess]
65
67
  lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(filename, page_index))
66
- page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
68
+ page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
67
69
  page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
68
70
  else
69
71
  page_areas = [area]
70
72
  end
71
73
 
72
74
  page_areas.each do |page_area|
73
- text = page.get_text( page_area )
74
- Tabula::Writers.send(opts[:format].to_sym,
75
- Tabula.make_table(text),
76
- out)
75
+ if opts[:guess] && opts[:debug]
76
+ puts (page_index + 1).to_s + ', ' + page_area.to_s
77
+ else
78
+ text = page.get_text( page_area )
79
+ Tabula::Writers.send(opts[:format].to_sym,
80
+ Tabula.make_table(text),
81
+ out)
82
+ end
83
+
77
84
  end
78
85
  end
79
86
  out.close
80
87
  end
81
88
 
82
89
  main
83
-
data/ext/Makefile.OSX CHANGED
@@ -1,9 +1,12 @@
1
1
  include Makefile.defaults
2
2
 
3
+
4
+ CFLAGS := -arch i386 -arch x86_64 -fPIC -O3 -g -Wall -Werror
5
+
3
6
  lib: lib$(NAME).$(VERSION).dylib
4
7
 
5
8
  lib$(NAME).$(VERSION).dylib: $(NAME).o
6
- $(CC) -dynamiclib -lm -o lib$(NAME).dylib $^
9
+ $(CC) -arch i386 -arch x86_64 -dynamiclib -lm -o lib$(NAME).dylib $^
7
10
 
8
11
  clean:
9
12
  $(RM) *.o
data/ext/liblsd.dylib CHANGED
Binary file
data/lib/tabula.rb CHANGED
@@ -10,3 +10,4 @@ require_relative './tabula/writers'
10
10
  require_relative './tabula/table_guesser'
11
11
  require_relative './tabula/line_segment_detector'
12
12
  require_relative './tabula/pdf_render'
13
+ #require_relative './tabula/whitespace'
@@ -10,7 +10,7 @@ module Enumerable
10
10
 
11
11
  def sample_variance
12
12
  m = self.mean
13
- sum = self.inject(0){|accum, i| accum +(i-m)**2 }
13
+ sum = self.inject(0) {|accum, i| accum + (i-m)**2 }
14
14
  sum/(self.length - 1).to_f
15
15
  end
16
16
 
@@ -18,4 +18,8 @@ module Enumerable
18
18
  return Math.sqrt(self.sample_variance)
19
19
  end
20
20
 
21
- end
21
+ def sorted?
22
+ each_cons(2).all? { |a, b| (a <=> b) <= 0 }
23
+ end
24
+
25
+ end
@@ -70,6 +70,22 @@ module Tabula
70
70
  intersection_area / union_area
71
71
  end
72
72
 
73
+ # as defined by PDF-TREX paper
74
+ def horizontal_overlap_ratio(other)
75
+ delta = [self.bottom - self.top, other.bottom - other.top].min
76
+ if [other.top, self.top, other.bottom, self.bottom].sorted?
77
+ (other.bottom - self.top) / delta
78
+ elsif [self.top, other.top, self.bottom, other.bottom].sorted?
79
+ (self.bottom - other.top) / delta
80
+ elsif [self.top, other.top, other.bottom, self.bottom].sorted?
81
+ (other.bottom - other.top) / delta
82
+ elsif [other.top, self.top, self.bottom, other.bottom].sorted?
83
+ (self.bottom - self.top) / delta
84
+ else
85
+ 0
86
+ end
87
+ end
88
+
73
89
  def to_h
74
90
  hash = {}
75
91
  [:top, :left, :width, :height].each do |m|
@@ -99,8 +115,8 @@ module Tabula
99
115
 
100
116
  # spaces are not detected, b/c they have height == 0
101
117
  # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
102
- # self.texts.select { |t| t.overlaps? ze }
103
- self.texts.select do |t|
118
+ # self.texts.select { |t| t.overlaps? ze }
119
+ self.texts.select do |t|
104
120
  t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
105
121
  end
106
122
  end
@@ -179,12 +195,32 @@ module Tabula
179
195
  end
180
196
  end
181
197
 
198
+ class Table
199
+ attr_reader :lines
200
+ def initialize(line_count, separators)
201
+ @separators = separators
202
+ @lines = (0...line_count).inject([]) { |m| m << Line.new }
203
+ end
204
+
205
+ def add_text_element(text_element, i, j)
206
+ if @lines.size <= i
207
+ @lines[i] = Line.new
208
+ end
209
+ if @lines[i].text_elements[j]
210
+ @lines[i].text_elements[j].merge!(text_element)
211
+ else
212
+ @lines[i].text_elements[j] = text_element
213
+ end
214
+ end
215
+ end
182
216
 
183
217
  class Line < ZoneEntity
184
218
  attr_accessor :text_elements
219
+ attr_reader :index
185
220
 
186
- def initialize
221
+ def initialize(index=nil)
187
222
  self.text_elements = []
223
+ @index = index
188
224
  end
189
225
 
190
226
  def <<(t)
@@ -5,6 +5,7 @@ require 'ffi'
5
5
 
6
6
  require_relative './entities'
7
7
  require_relative './pdf_render'
8
+ require_relative './pdf_dump'
8
9
  require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
9
10
 
10
11
  java_import javax.imageio.ImageIO
@@ -45,7 +46,7 @@ module Tabula
45
46
  def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
46
47
  options = DETECT_LINES_DEFAULTS.merge(options)
47
48
 
48
- pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
49
+ pdf_file = Extraction.openPDF(pdf_path)
49
50
  page = pdf_file.getDocumentCatalog.getAllPages[page_number]
50
51
  bi = Tabula::Render.pageToBufferedImage(page,
51
52
  options[:image_size])
@@ -62,9 +63,14 @@ module Tabula
62
63
  image
63
64
  elsif image.class == String
64
65
  ImageIO.read(java.io.File.new(image))
65
- else
66
+ else
66
67
  raise ArgumentError, 'image must be a string or a BufferedImage'
67
68
  end
69
+
70
+ ImageIO.write(bimage,
71
+ 'png',
72
+ java.io.File.new("/tmp/white.png"))
73
+
68
74
  image = LSD.image_to_image_double(bimage)
69
75
 
70
76
  lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
@@ -7,9 +7,22 @@ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
7
7
  java_import org.apache.pdfbox.pdfparser.PDFParser
8
8
  java_import org.apache.pdfbox.pdmodel.PDDocument
9
9
  java_import org.apache.pdfbox.util.PDFTextStripper
10
+ java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
10
11
 
11
12
  module Tabula
13
+
12
14
  module Extraction
15
+
16
+ def Extraction.openPDF(pdf_filename, password='')
17
+ raise Errno::ENOENT unless File.exists?(pdf_filename)
18
+ document = PDDocument.load(pdf_filename)
19
+ if document.isEncrypted
20
+ sdm = StandardDecryptionMaterial.new(password)
21
+ document.openProtection(sdm)
22
+ end
23
+ document
24
+ end
25
+
13
26
  class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
14
27
 
15
28
  attr_accessor :characters, :fonts
@@ -28,8 +41,9 @@ module Tabula
28
41
  end
29
42
 
30
43
 
44
+
31
45
  def processTextPosition(text)
32
- # return if text.getCharacter == ' '
46
+ # return if text.getCharacter == ' '
33
47
 
34
48
  # text_font = text.getFont
35
49
  # text_size = text.getFontSize
@@ -49,9 +63,8 @@ module Tabula
49
63
  end
50
64
 
51
65
  class PagesInfoExtractor
52
- def initialize(pdf_filename)
53
- raise Errno::ENOENT unless File.exists?(pdf_filename)
54
- @pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
66
+ def initialize(pdf_filename, password='')
67
+ @pdf_file = Extraction.openPDF(pdf_filename, password)
55
68
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
56
69
  end
57
70
 
@@ -60,7 +73,7 @@ module Tabula
60
73
  begin
61
74
  @all_pages.each_with_index do |page, i|
62
75
  contents = page.getContents
63
- next if contents.nil?
76
+ # next if contents.nil?
64
77
  y.yield Tabula::Page.new(page.findCropBox.width,
65
78
  page.findCropBox.height,
66
79
  page.getRotation.to_i,
@@ -78,9 +91,9 @@ module Tabula
78
91
  include Observable
79
92
 
80
93
  #N.B. pages can be :all, a list of pages or a range.
81
- def initialize(pdf_filename, pages=[1])
94
+ def initialize(pdf_filename, pages=[1], password='')
82
95
  raise Errno::ENOENT unless File.exists?(pdf_filename)
83
- @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
96
+ @pdf_file = Extraction.openPDF(pdf_filename, password)
84
97
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
85
98
  @pages = pages == :all ? (1..@all_pages.size) : pages
86
99
  @extractor = TextExtractor.new
@@ -105,7 +118,7 @@ module Tabula
105
118
  char.getXDirAdj.round(2),
106
119
  char.getWidthDirAdj.round(2),
107
120
  char.getHeightDir.round(2),
108
- nil,
121
+ char.getFont,
109
122
  char.getFontSize.round(2),
110
123
  char.getCharacter,
111
124
  char.getWidthOfSpace)
@@ -115,12 +115,9 @@ module Tabula
115
115
 
116
116
  char2 = self.text_elements[i+1]
117
117
 
118
-
119
-
120
118
  next if char2.nil? or char1.nil?
121
119
 
122
120
  if self.text_elements[current_word_index].should_merge?(char2)
123
- #puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
124
121
  self.text_elements[current_word_index].merge!(char2)
125
122
  char1 = char2
126
123
  self.text_elements[i+1] = nil
@@ -166,92 +163,60 @@ module Tabula
166
163
 
167
164
  ONLY_SPACES_RE = Regexp.new('^\s+$')
168
165
 
169
- # Returns an array of Tabula::Line
170
- def Tabula.make_table(text_elements, options={})
171
- extractor = TableExtractor.new(text_elements, options)
172
-
173
- # group by lines
166
+ def Tabula.group_by_lines(text_elements)
174
167
  lines = []
175
- line_boundaries = extractor.get_line_boundaries
176
-
177
- # find all the text elements
178
- # contained within each detected line (table row) boundary
179
- line_boundaries.each do |lb|
180
- line = Line.new
181
-
182
- line_members = text_elements.find_all do |te|
183
- te.vertically_overlaps?(lb)
184
- end
185
-
186
- text_elements -= line_members
187
-
188
- line_members.sort_by(&:left).each do |te|
189
- # skip text_elements that only contain spaces
190
- next if te.text =~ ONLY_SPACES_RE
191
- line << te
168
+ text_elements.each do |te|
169
+ next if te.text =~ ONLY_SPACES_RE
170
+ l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
171
+ if l.nil?
172
+ l = Line.new
173
+ lines << l
192
174
  end
193
-
194
- lines << line if line.text_elements.size > 0
175
+ l << te
195
176
  end
177
+ lines
178
+ end
196
179
 
197
- lines.sort_by!(&:top)
198
-
199
- columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words]}).group_by_columns.sort_by(&:left)
200
-
201
- # # insert empty cells if needed
202
- lines.each_with_index do |l, line_index|
203
- next if l.text_elements.nil?
204
- l.text_elements.compact! # TODO WHY do I have to do this?
205
- l.text_elements.uniq! # TODO WHY do I have to do this?
206
- l.text_elements.sort_by!(&:left)
207
-
208
- #next unless l.text_elements.size < columns.size
209
-
210
- columns.each_with_index do |c, i|
211
- if (i > l.text_elements.size - 1) or (!l.text_elements[i].nil? and !c.text_elements.include?(l.text_elements[i]))
212
- l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
180
+ # Returns an array of Tabula::Line
181
+ def Tabula.make_table(text_elements, options={})
182
+ default_options = {:separators => []}
183
+ options = default_options.merge(options)
184
+
185
+ extractor = TableExtractor.new(text_elements, options).text_elements
186
+ lines = group_by_lines(text_elements)
187
+ top = lines[0].text_elements.map(&:top).min
188
+ right = 0
189
+ columns = []
190
+
191
+ text_elements.sort_by(&:left).each do |te|
192
+ next if te.text =~ ONLY_SPACES_RE
193
+ if te.top >= top
194
+ left = te.left
195
+ if (left > right)
196
+ columns << right
197
+ right = te.right
198
+ elsif te.right > right
199
+ right = te.right
213
200
  end
214
201
  end
215
202
  end
216
203
 
217
- # # merge elements that are in the same column
218
- lines.each_with_index do |l, line_index|
219
- next if l.text_elements.nil?
220
-
221
- (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
222
- next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
204
+ separators = columns[1..-1].sort.reverse
223
205
 
224
- # if same column...
225
- if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
226
- == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
227
- if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
228
- l.text_elements[t1].merge!(l.text_elements[t2])
229
- l.text_elements[t2] = nil
230
- else
231
- l.text_elements[t2].merge!(l.text_elements[t1])
232
- l.text_elements[t1] = nil
233
- end
234
- end
206
+ table = Table.new(lines.count, separators)
207
+ lines.each_with_index do |line, i|
208
+ line.text_elements.each do |te|
209
+ j = separators.find_index { |s| te.left > s } || separators.count
210
+ table.add_text_element(te, i, separators.count - j)
235
211
  end
236
-
237
- l.text_elements.compact!
238
212
  end
239
213
 
240
- # remove duplicate lines
241
- # TODO this shouldn't have happened here, check why we have to do
242
- # this (maybe duplication is happening in the column merging phase?)
243
- (0..lines.size - 2).each do |i|
244
- next if lines[i].nil?
245
- # if any of the elements on the next line is duplicated, kill
246
- # the next line
247
- if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
248
- lines[i+1] = nil
249
- end
214
+ table.lines.map do |l|
215
+ l.text_elements.map! { |te|
216
+ te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
217
+ }
250
218
  end
251
219
 
252
- lines.compact.map do |line|
253
- line.text_elements.sort_by(&:left)
254
- end
255
220
  end
256
221
 
257
222
 
@@ -340,9 +305,4 @@ module Tabula
340
305
  line.text_elements.sort_by(&:left)
341
306
  end
342
307
  end
343
-
344
-
345
-
346
-
347
-
348
308
  end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.3'
2
+ VERSION = '0.6.4'
3
3
  end
@@ -15,7 +15,7 @@ module Tabula
15
15
  end
16
16
 
17
17
  def Writers.TSV(lines, output=$stdout)
18
- tsv_string = lines.each { |l|
18
+ lines.each { |l|
19
19
  output.write(l.map(&:text).join("\t") + "\n")
20
20
  }
21
21
  end
@@ -22,6 +22,8 @@ Gem::Specification.new do |s|
22
22
 
23
23
  s.add_development_dependency 'minitest'
24
24
  s.add_development_dependency 'bundler', '>= 1.3.4'
25
+ s.add_development_dependency 'ruby-debug'
25
26
 
26
27
  s.add_runtime_dependency "trollop", ["~> 2.0"]
28
+ # s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
27
29
  end
data/test/tests.rb CHANGED
@@ -114,7 +114,6 @@ class TestExtractor < Minitest::Test
114
114
  lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
115
115
  vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
116
116
 
117
-
118
117
  characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
119
118
  #top left bottom right
120
119
  expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.6.3
5
+ version: 0.6.4
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-06-29 00:00:00.000000000 Z
14
+ date: 2013-07-09 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: minitest
@@ -45,6 +45,22 @@ dependencies:
45
45
  none: false
46
46
  prerelease: false
47
47
  type: :development
48
+ - !ruby/object:Gem::Dependency
49
+ name: ruby-debug
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ none: false
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ none: false
62
+ prerelease: false
63
+ type: :development
48
64
  - !ruby/object:Gem::Dependency
49
65
  name: trollop
50
66
  version_requirements: !ruby/object:Gem::Requirement
@@ -126,12 +142,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
126
142
  requirements:
127
143
  - - '>='
128
144
  - !ruby/object:Gem::Version
145
+ segments:
146
+ - 0
147
+ hash: 2
129
148
  version: '0'
130
149
  none: false
131
150
  required_rubygems_version: !ruby/object:Gem::Requirement
132
151
  requirements:
133
152
  - - '>='
134
153
  - !ruby/object:Gem::Version
154
+ segments:
155
+ - 0
156
+ hash: 2
135
157
  version: '0'
136
158
  none: false
137
159
  requirements: []