tabula-extractor 0.6.3-java → 0.6.4-java

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake'
6
6
  Bundler::GemHelper.install_tasks
7
7
 
8
8
  task :test do
9
- ruby %{-J-Xmx512m test/tests.rb}
9
+ ruby %{--debug -X-C -J-Xmx512m test/tests.rb}
10
10
  end
11
11
 
12
12
  task :default => [:test]
data/bin/tabula CHANGED
@@ -34,11 +34,13 @@ EOS
34
34
 
35
35
  opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
36
36
  opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
37
+ opt :password, 'Password to decrypt document. Default is empty', :default => ''
37
38
  opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
39
+ opt :debug, 'Print detected table areas instead of processing.'
38
40
  opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
39
41
  opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
40
42
  end
41
-
43
+
42
44
  if !opts[:area].nil?
43
45
  unless opts[:area].split(',').size == 4 \
44
46
  && opts[:area].split(',').all? { |x| x.strip =~ /(\d+\.?\d*)/ }
@@ -59,25 +61,29 @@ def main
59
61
 
60
62
  area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
61
63
  out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
62
- extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
64
+ extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
63
65
  extractor.extract.each_with_index do |page, page_index|
64
- if opts[:guess]
66
+ if opts[:guess]
65
67
  lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(filename, page_index))
66
- page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
68
+ page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
67
69
  page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
68
70
  else
69
71
  page_areas = [area]
70
72
  end
71
73
 
72
74
  page_areas.each do |page_area|
73
- text = page.get_text( page_area )
74
- Tabula::Writers.send(opts[:format].to_sym,
75
- Tabula.make_table(text),
76
- out)
75
+ if opts[:guess] && opts[:debug]
76
+ puts (page_index + 1).to_s + ', ' + page_area.to_s
77
+ else
78
+ text = page.get_text( page_area )
79
+ Tabula::Writers.send(opts[:format].to_sym,
80
+ Tabula.make_table(text),
81
+ out)
82
+ end
83
+
77
84
  end
78
85
  end
79
86
  out.close
80
87
  end
81
88
 
82
89
  main
83
-
data/ext/Makefile.OSX CHANGED
@@ -1,9 +1,12 @@
1
1
  include Makefile.defaults
2
2
 
3
+
4
+ CFLAGS := -arch i386 -arch x86_64 -fPIC -O3 -g -Wall -Werror
5
+
3
6
  lib: lib$(NAME).$(VERSION).dylib
4
7
 
5
8
  lib$(NAME).$(VERSION).dylib: $(NAME).o
6
- $(CC) -dynamiclib -lm -o lib$(NAME).dylib $^
9
+ $(CC) -arch i386 -arch x86_64 -dynamiclib -lm -o lib$(NAME).dylib $^
7
10
 
8
11
  clean:
9
12
  $(RM) *.o
data/ext/liblsd.dylib CHANGED
Binary file
data/lib/tabula.rb CHANGED
@@ -10,3 +10,4 @@ require_relative './tabula/writers'
10
10
  require_relative './tabula/table_guesser'
11
11
  require_relative './tabula/line_segment_detector'
12
12
  require_relative './tabula/pdf_render'
13
+ #require_relative './tabula/whitespace'
@@ -10,7 +10,7 @@ module Enumerable
10
10
 
11
11
  def sample_variance
12
12
  m = self.mean
13
- sum = self.inject(0){|accum, i| accum +(i-m)**2 }
13
+ sum = self.inject(0) {|accum, i| accum + (i-m)**2 }
14
14
  sum/(self.length - 1).to_f
15
15
  end
16
16
 
@@ -18,4 +18,8 @@ module Enumerable
18
18
  return Math.sqrt(self.sample_variance)
19
19
  end
20
20
 
21
- end
21
+ def sorted?
22
+ each_cons(2).all? { |a, b| (a <=> b) <= 0 }
23
+ end
24
+
25
+ end
@@ -70,6 +70,22 @@ module Tabula
70
70
  intersection_area / union_area
71
71
  end
72
72
 
73
+ # as defined by PDF-TREX paper
74
+ def horizontal_overlap_ratio(other)
75
+ delta = [self.bottom - self.top, other.bottom - other.top].min
76
+ if [other.top, self.top, other.bottom, self.bottom].sorted?
77
+ (other.bottom - self.top) / delta
78
+ elsif [self.top, other.top, self.bottom, other.bottom].sorted?
79
+ (self.bottom - other.top) / delta
80
+ elsif [self.top, other.top, other.bottom, self.bottom].sorted?
81
+ (other.bottom - other.top) / delta
82
+ elsif [other.top, self.top, self.bottom, other.bottom].sorted?
83
+ (self.bottom - self.top) / delta
84
+ else
85
+ 0
86
+ end
87
+ end
88
+
73
89
  def to_h
74
90
  hash = {}
75
91
  [:top, :left, :width, :height].each do |m|
@@ -99,8 +115,8 @@ module Tabula
99
115
 
100
116
  # spaces are not detected, b/c they have height == 0
101
117
  # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
102
- # self.texts.select { |t| t.overlaps? ze }
103
- self.texts.select do |t|
118
+ # self.texts.select { |t| t.overlaps? ze }
119
+ self.texts.select do |t|
104
120
  t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
105
121
  end
106
122
  end
@@ -179,12 +195,32 @@ module Tabula
179
195
  end
180
196
  end
181
197
 
198
+ class Table
199
+ attr_reader :lines
200
+ def initialize(line_count, separators)
201
+ @separators = separators
202
+ @lines = (0...line_count).inject([]) { |m| m << Line.new }
203
+ end
204
+
205
+ def add_text_element(text_element, i, j)
206
+ if @lines.size <= i
207
+ @lines[i] = Line.new
208
+ end
209
+ if @lines[i].text_elements[j]
210
+ @lines[i].text_elements[j].merge!(text_element)
211
+ else
212
+ @lines[i].text_elements[j] = text_element
213
+ end
214
+ end
215
+ end
182
216
 
183
217
  class Line < ZoneEntity
184
218
  attr_accessor :text_elements
219
+ attr_reader :index
185
220
 
186
- def initialize
221
+ def initialize(index=nil)
187
222
  self.text_elements = []
223
+ @index = index
188
224
  end
189
225
 
190
226
  def <<(t)
@@ -5,6 +5,7 @@ require 'ffi'
5
5
 
6
6
  require_relative './entities'
7
7
  require_relative './pdf_render'
8
+ require_relative './pdf_dump'
8
9
  require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
9
10
 
10
11
  java_import javax.imageio.ImageIO
@@ -45,7 +46,7 @@ module Tabula
45
46
  def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
46
47
  options = DETECT_LINES_DEFAULTS.merge(options)
47
48
 
48
- pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
49
+ pdf_file = Extraction.openPDF(pdf_path)
49
50
  page = pdf_file.getDocumentCatalog.getAllPages[page_number]
50
51
  bi = Tabula::Render.pageToBufferedImage(page,
51
52
  options[:image_size])
@@ -62,9 +63,14 @@ module Tabula
62
63
  image
63
64
  elsif image.class == String
64
65
  ImageIO.read(java.io.File.new(image))
65
- else
66
+ else
66
67
  raise ArgumentError, 'image must be a string or a BufferedImage'
67
68
  end
69
+
70
+ ImageIO.write(bimage,
71
+ 'png',
72
+ java.io.File.new("/tmp/white.png"))
73
+
68
74
  image = LSD.image_to_image_double(bimage)
69
75
 
70
76
  lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
@@ -7,9 +7,22 @@ require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
7
7
  java_import org.apache.pdfbox.pdfparser.PDFParser
8
8
  java_import org.apache.pdfbox.pdmodel.PDDocument
9
9
  java_import org.apache.pdfbox.util.PDFTextStripper
10
+ java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
10
11
 
11
12
  module Tabula
13
+
12
14
  module Extraction
15
+
16
+ def Extraction.openPDF(pdf_filename, password='')
17
+ raise Errno::ENOENT unless File.exists?(pdf_filename)
18
+ document = PDDocument.load(pdf_filename)
19
+ if document.isEncrypted
20
+ sdm = StandardDecryptionMaterial.new(password)
21
+ document.openProtection(sdm)
22
+ end
23
+ document
24
+ end
25
+
13
26
  class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
14
27
 
15
28
  attr_accessor :characters, :fonts
@@ -28,8 +41,9 @@ module Tabula
28
41
  end
29
42
 
30
43
 
44
+
31
45
  def processTextPosition(text)
32
- # return if text.getCharacter == ' '
46
+ # return if text.getCharacter == ' '
33
47
 
34
48
  # text_font = text.getFont
35
49
  # text_size = text.getFontSize
@@ -49,9 +63,8 @@ module Tabula
49
63
  end
50
64
 
51
65
  class PagesInfoExtractor
52
- def initialize(pdf_filename)
53
- raise Errno::ENOENT unless File.exists?(pdf_filename)
54
- @pdf_file = PDDocument.load(java.io.File.new(pdf_filename))
66
+ def initialize(pdf_filename, password='')
67
+ @pdf_file = Extraction.openPDF(pdf_filename, password)
55
68
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
56
69
  end
57
70
 
@@ -60,7 +73,7 @@ module Tabula
60
73
  begin
61
74
  @all_pages.each_with_index do |page, i|
62
75
  contents = page.getContents
63
- next if contents.nil?
76
+ # next if contents.nil?
64
77
  y.yield Tabula::Page.new(page.findCropBox.width,
65
78
  page.findCropBox.height,
66
79
  page.getRotation.to_i,
@@ -78,9 +91,9 @@ module Tabula
78
91
  include Observable
79
92
 
80
93
  #N.B. pages can be :all, a list of pages or a range.
81
- def initialize(pdf_filename, pages=[1])
94
+ def initialize(pdf_filename, pages=[1], password='')
82
95
  raise Errno::ENOENT unless File.exists?(pdf_filename)
83
- @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
96
+ @pdf_file = Extraction.openPDF(pdf_filename, password)
84
97
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
85
98
  @pages = pages == :all ? (1..@all_pages.size) : pages
86
99
  @extractor = TextExtractor.new
@@ -105,7 +118,7 @@ module Tabula
105
118
  char.getXDirAdj.round(2),
106
119
  char.getWidthDirAdj.round(2),
107
120
  char.getHeightDir.round(2),
108
- nil,
121
+ char.getFont,
109
122
  char.getFontSize.round(2),
110
123
  char.getCharacter,
111
124
  char.getWidthOfSpace)
@@ -115,12 +115,9 @@ module Tabula
115
115
 
116
116
  char2 = self.text_elements[i+1]
117
117
 
118
-
119
-
120
118
  next if char2.nil? or char1.nil?
121
119
 
122
120
  if self.text_elements[current_word_index].should_merge?(char2)
123
- #puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
124
121
  self.text_elements[current_word_index].merge!(char2)
125
122
  char1 = char2
126
123
  self.text_elements[i+1] = nil
@@ -166,92 +163,60 @@ module Tabula
166
163
 
167
164
  ONLY_SPACES_RE = Regexp.new('^\s+$')
168
165
 
169
- # Returns an array of Tabula::Line
170
- def Tabula.make_table(text_elements, options={})
171
- extractor = TableExtractor.new(text_elements, options)
172
-
173
- # group by lines
166
+ def Tabula.group_by_lines(text_elements)
174
167
  lines = []
175
- line_boundaries = extractor.get_line_boundaries
176
-
177
- # find all the text elements
178
- # contained within each detected line (table row) boundary
179
- line_boundaries.each do |lb|
180
- line = Line.new
181
-
182
- line_members = text_elements.find_all do |te|
183
- te.vertically_overlaps?(lb)
184
- end
185
-
186
- text_elements -= line_members
187
-
188
- line_members.sort_by(&:left).each do |te|
189
- # skip text_elements that only contain spaces
190
- next if te.text =~ ONLY_SPACES_RE
191
- line << te
168
+ text_elements.each do |te|
169
+ next if te.text =~ ONLY_SPACES_RE
170
+ l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
171
+ if l.nil?
172
+ l = Line.new
173
+ lines << l
192
174
  end
193
-
194
- lines << line if line.text_elements.size > 0
175
+ l << te
195
176
  end
177
+ lines
178
+ end
196
179
 
197
- lines.sort_by!(&:top)
198
-
199
- columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words]}).group_by_columns.sort_by(&:left)
200
-
201
- # # insert empty cells if needed
202
- lines.each_with_index do |l, line_index|
203
- next if l.text_elements.nil?
204
- l.text_elements.compact! # TODO WHY do I have to do this?
205
- l.text_elements.uniq! # TODO WHY do I have to do this?
206
- l.text_elements.sort_by!(&:left)
207
-
208
- #next unless l.text_elements.size < columns.size
209
-
210
- columns.each_with_index do |c, i|
211
- if (i > l.text_elements.size - 1) or (!l.text_elements[i].nil? and !c.text_elements.include?(l.text_elements[i]))
212
- l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
180
+ # Returns an array of Tabula::Line
181
+ def Tabula.make_table(text_elements, options={})
182
+ default_options = {:separators => []}
183
+ options = default_options.merge(options)
184
+
185
+ extractor = TableExtractor.new(text_elements, options).text_elements
186
+ lines = group_by_lines(text_elements)
187
+ top = lines[0].text_elements.map(&:top).min
188
+ right = 0
189
+ columns = []
190
+
191
+ text_elements.sort_by(&:left).each do |te|
192
+ next if te.text =~ ONLY_SPACES_RE
193
+ if te.top >= top
194
+ left = te.left
195
+ if (left > right)
196
+ columns << right
197
+ right = te.right
198
+ elsif te.right > right
199
+ right = te.right
213
200
  end
214
201
  end
215
202
  end
216
203
 
217
- # # merge elements that are in the same column
218
- lines.each_with_index do |l, line_index|
219
- next if l.text_elements.nil?
220
-
221
- (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
222
- next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
204
+ separators = columns[1..-1].sort.reverse
223
205
 
224
- # if same column...
225
- if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
226
- == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
227
- if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
228
- l.text_elements[t1].merge!(l.text_elements[t2])
229
- l.text_elements[t2] = nil
230
- else
231
- l.text_elements[t2].merge!(l.text_elements[t1])
232
- l.text_elements[t1] = nil
233
- end
234
- end
206
+ table = Table.new(lines.count, separators)
207
+ lines.each_with_index do |line, i|
208
+ line.text_elements.each do |te|
209
+ j = separators.find_index { |s| te.left > s } || separators.count
210
+ table.add_text_element(te, i, separators.count - j)
235
211
  end
236
-
237
- l.text_elements.compact!
238
212
  end
239
213
 
240
- # remove duplicate lines
241
- # TODO this shouldn't have happened here, check why we have to do
242
- # this (maybe duplication is happening in the column merging phase?)
243
- (0..lines.size - 2).each do |i|
244
- next if lines[i].nil?
245
- # if any of the elements on the next line is duplicated, kill
246
- # the next line
247
- if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
248
- lines[i+1] = nil
249
- end
214
+ table.lines.map do |l|
215
+ l.text_elements.map! { |te|
216
+ te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
217
+ }
250
218
  end
251
219
 
252
- lines.compact.map do |line|
253
- line.text_elements.sort_by(&:left)
254
- end
255
220
  end
256
221
 
257
222
 
@@ -340,9 +305,4 @@ module Tabula
340
305
  line.text_elements.sort_by(&:left)
341
306
  end
342
307
  end
343
-
344
-
345
-
346
-
347
-
348
308
  end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.3'
2
+ VERSION = '0.6.4'
3
3
  end
@@ -15,7 +15,7 @@ module Tabula
15
15
  end
16
16
 
17
17
  def Writers.TSV(lines, output=$stdout)
18
- tsv_string = lines.each { |l|
18
+ lines.each { |l|
19
19
  output.write(l.map(&:text).join("\t") + "\n")
20
20
  }
21
21
  end
@@ -22,6 +22,8 @@ Gem::Specification.new do |s|
22
22
 
23
23
  s.add_development_dependency 'minitest'
24
24
  s.add_development_dependency 'bundler', '>= 1.3.4'
25
+ s.add_development_dependency 'ruby-debug'
25
26
 
26
27
  s.add_runtime_dependency "trollop", ["~> 2.0"]
28
+ # s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
27
29
  end
data/test/tests.rb CHANGED
@@ -114,7 +114,6 @@ class TestExtractor < Minitest::Test
114
114
  lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
115
115
  vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
116
116
 
117
-
118
117
  characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
119
118
  #top left bottom right
120
119
  expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.6.3
5
+ version: 0.6.4
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-06-29 00:00:00.000000000 Z
14
+ date: 2013-07-09 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: minitest
@@ -45,6 +45,22 @@ dependencies:
45
45
  none: false
46
46
  prerelease: false
47
47
  type: :development
48
+ - !ruby/object:Gem::Dependency
49
+ name: ruby-debug
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ none: false
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ none: false
62
+ prerelease: false
63
+ type: :development
48
64
  - !ruby/object:Gem::Dependency
49
65
  name: trollop
50
66
  version_requirements: !ruby/object:Gem::Requirement
@@ -126,12 +142,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
126
142
  requirements:
127
143
  - - '>='
128
144
  - !ruby/object:Gem::Version
145
+ segments:
146
+ - 0
147
+ hash: 2
129
148
  version: '0'
130
149
  none: false
131
150
  required_rubygems_version: !ruby/object:Gem::Requirement
132
151
  requirements:
133
152
  - - '>='
134
153
  - !ruby/object:Gem::Version
154
+ segments:
155
+ - 0
156
+ hash: 2
135
157
  version: '0'
136
158
  none: false
137
159
  requirements: []