tabula-extractor 0.7.2-java → 0.7.4-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
@@ -0,0 +1,42 @@
1
module Tabula
  ##
  # Support code for declaring pseudo-interfaces: a module that includes
  # AbstractInterface gains +api_not_implemented+, which its placeholder
  # methods call to report that an implementor forgot to override them.
  module AbstractInterface

    # Raised when an interface method has not been overridden by the
    # implementing class. Subclasses NoMethodError so existing
    # `rescue NoMethodError` handlers keep working.
    class InterfaceNotImplementedError < NoMethodError
    end

    # Hook run when AbstractInterface is included into an interface module:
    # makes the helper available both as an instance method and as a
    # module-level method (e.g. +Tabular.api_not_implemented+).
    def self.included(klass)
      klass.send(:include, AbstractInterface::Methods)
      klass.send(:extend, AbstractInterface::Methods)
    end

    module Methods
      ##
      # Raise InterfaceNotImplementedError naming the missing method.
      #
      # +klass+:: the object (an instance of the implementing class) on which
      #           the unimplemented interface method was called.
      #
      # The method name is taken from the immediate caller's backtrace entry.
      # Both backtrace formats are accepted: "in `rows'" (Ruby <= 3.3) and
      # "in 'Tabula::Tabular#rows'" (Ruby >= 3.4, which quotes with "'" and
      # prefixes the owner). Any "Owner#" / "Owner." prefix is dropped so the
      # message is the same on every Ruby version.
      def api_not_implemented(klass)
        frame = caller.first.to_s
        match = /in [`'](?:.*[#.])?(.+)'/.match(frame)
        method_name = match && match[1]

        # The helper is normally invoked on the interface module itself, but it
        # is also mixed in as an instance method, where +name+ may not exist.
        interface_name = is_a?(Module) ? name : self.class.name

        raise AbstractInterface::InterfaceNotImplementedError,
              "#{klass.class.name} needs to implement '#{method_name}' for interface #{interface_name}!"
      end
    end
  end


  ##
  # This is a pseudo-interface as described here:
  # http://metabates.com/2011/02/07/building-interfaces-and-abstract-classes-in-ruby/
  #
  # Table and Spreadsheet implement this interface, and so should any class
  # intended to represent tabular data from a PDF (e.g. if another extraction
  # method were created), so that the Tabula GUI and API can correctly handle
  # its data.
  #
  # Every method below is a placeholder that raises
  # AbstractInterface::InterfaceNotImplementedError until overridden.
  module Tabular
    include AbstractInterface

    def extraction_method; Tabular.api_not_implemented(self); end

    def page; Tabular.api_not_implemented(self); end
    def rows; Tabular.api_not_implemented(self); end
    def cols; Tabular.api_not_implemented(self); end

    def to_csv; Tabular.api_not_implemented(self); end
    def to_tsv; Tabular.api_not_implemented(self); end
    def to_a; Tabular.api_not_implemented(self); end
    # Accepts (and ignores) the optional state/options argument that JSON
    # generators pass, so callers get the interface error, not ArgumentError.
    def to_json(*_args); Tabular.api_not_implemented(self); end
  end
end
@@ -8,46 +8,71 @@ module Tabula
8
8
  # initialize a new TextChunk from a TextElement
9
9
  def self.create_from_text_element(text_element)
10
10
  raise TypeError, "argument is not a TextElement" unless text_element.instance_of?(TextElement)
11
- tc = self.new(text_element.top, text_element.left, text_element.width, text_element.height)
11
+ tc = self.new(*text_element.tlwh)
12
12
  tc.text_elements = [text_element]
13
13
  return tc
14
14
  end
15
15
 
16
- ##
17
- # group an iterable of TextChunk into a list of Line
18
16
  def self.group_by_lines(text_chunks)
19
- lines = []
20
- text_chunks.each do |te|
21
- next if te.text =~ ONLY_SPACES_RE
22
- l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
23
- if l.nil?
24
- l = Line.new
25
- lines << l
17
+ bbwidth = text_chunks.max_by(&:right).right - text_chunks.min_by(&:left).left
18
+
19
+ l = Line.new
20
+ l << text_chunks.first
21
+
22
+ lines = text_chunks[1..-1].inject([l]) do |lines, te|
23
+ if lines.last.horizontal_overlap_ratio(te) < 0.01
24
+ # skip lines that:
25
+ # - are wider than 90% of the width of the text_chunks bounding box, and
26
+ # - contain only a single repeated character
27
+ if lines.last.width / bbwidth > 0.9 \
28
+ && l.text_elements.all? { |te| te.text =~ SAME_CHAR_RE }
29
+ lines.pop
30
+ end
31
+ lines << Line.new
26
32
  end
27
- l << te
33
+ lines.last << te
34
+ lines
28
35
  end
29
- lines
36
+
37
+ if lines.last.width / bbwidth > 0.9 \
38
+ && l.text_elements.all? { |te| te.text =~ SAME_CHAR_RE }
39
+ lines.pop
40
+ end
41
+
42
+ lines.map!(&:remove_sequential_spaces!)
30
43
  end
31
44
 
32
45
  ##
33
- # calculate estimated columns from an iterable of TextChunk
34
- def self.column_positions(top, text_chunks)
35
- right = 0
36
- columns = []
37
-
38
- text_chunks.each do |te|
39
- next if te.text =~ ONLY_SPACES_RE
40
- if te.top >= top
41
- left = te.left
42
- if (left > right)
43
- columns << right
44
- right = te.right
45
- elsif te.right > right
46
- right = te.right
46
+ # returns a list of column boundaries (x axis)
47
+ # +lines+ must be an array of lines sorted by their +top+ attribute
48
+ def self.column_positions(lines)
49
+ init = lines.first.text_elements.inject([]) { |memo, text_chunk|
50
+ next memo if text_chunk.text =~ ONLY_SPACES_RE
51
+ memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)
52
+ memo
53
+ }
54
+
55
+ regions = lines[1..-1]
56
+ .inject(init) do |column_regions, line|
57
+
58
+ line_text_elements = line.text_elements.clone.select { |te| te.text !~ ONLY_SPACES_RE }
59
+
60
+ column_regions.each do |cr|
61
+
62
+ overlaps = line_text_elements
63
+ .select { |te| te.text !~ ONLY_SPACES_RE && cr.horizontally_overlaps?(te) }
64
+
65
+ overlaps.inject(cr) do |memo, te|
66
+ cr.merge!(te)
47
67
  end
68
+
69
+ line_text_elements = line_text_elements - overlaps
48
70
  end
71
+
72
+ column_regions += line_text_elements.map { |te| Tabula::ZoneEntity.new(*te.tlwh) }
49
73
  end
50
- columns
74
+
75
+ regions.map { |r| r.right.round(2) }.uniq
51
76
  end
52
77
 
53
78
  ##
@@ -59,10 +84,10 @@ module Tabula
59
84
 
60
85
  def merge!(other)
61
86
  if other.instance_of?(TextChunk)
62
- if self.horizontally_overlaps?(other) && other.top < self.top
63
- self.text_elements = other.text_elements + self.text_elements
64
- else
87
+ if (self <=> other) < 0
65
88
  self.text_elements = self.text_elements + other.text_elements
89
+ else
90
+ self.text_elements = other.text_elements + self.text_elements
66
91
  end
67
92
  end
68
93
  super(other)
@@ -75,28 +100,6 @@ module Tabula
75
100
  raise "Not Implemented"
76
101
  end
77
102
 
78
- ##
79
- # remove leading and trailing whitespace
80
- # (changes geometry accordingly)
81
- # TODO horrible implementation - fix.
82
- def strip!
83
- acc = 0
84
- new_te = self.text_elements.drop_while { |te|
85
- te.text == ' ' && acc += 1
86
- }
87
- self.left += self.text_elements.take(acc).inject(0) { |m, te| m += te.width }
88
- self.text_elements = new_te
89
-
90
- self.text_elements.reverse!
91
- acc = 0
92
- new_te = self.text_elements.drop_while { |te|
93
- te.text == ' ' && acc += 1
94
- }
95
- self.right -= self.text_elements.take(acc).inject(0) { |m, te| m += te.width }
96
- self.text_elements = new_te.reverse
97
- self
98
- end
99
-
100
103
  def text
101
104
  self.text_elements.map(&:text).join
102
105
  end
@@ -1,4 +1,6 @@
1
+ # -*- coding: utf-8 -*-
1
2
  module Tabula
3
+
2
4
  ##
3
5
  # a Glyph
4
6
  class TextElement < ZoneEntity
@@ -17,8 +19,20 @@ module Tabula
17
19
 
18
20
  EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
19
21
 
22
+ def self.within(first, second, variance )
23
+ second < first + variance && second > first - variance
24
+ end
25
+
26
+ def self.overlap(y1, height1, y2, height2, variance=0.1)
27
+ within( y1, y2, variance) || (y2 <= y1 && y2 >= y1 - height1) \
28
+ || (y1 <= y2 && y1 >= y2-height2)
29
+ end
30
+
31
+
20
32
  ##
21
33
  # heuristically merge an iterable of TextElement into a list of TextChunk
34
+ # lots of ideas taken from PDFBox's PDFTextStripper.writePage
35
+ # here be dragons
22
36
  def self.merge_words(text_elements, options={})
23
37
  default_options = {:vertical_rulings => []}
24
38
  options = default_options.merge(options)
@@ -28,74 +42,138 @@ module Tabula
28
42
 
29
43
  text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
30
44
 
45
+
46
+ previousAveCharWidth = text_chunks.first.width
47
+ endOfLastTextX = text_chunks.first.right
48
+ maxYForLine = text_chunks.first.bottom
49
+ maxHeightForLine = text_chunks.first.height
50
+ minYTopForLine = text_chunks.first.top
51
+ lastWordSpacing = -1
52
+ sp = nil
53
+
31
54
  text_elements.inject(text_chunks) do |chunks, char|
55
+
32
56
  current_chunk = chunks.last
33
57
  prev_char = current_chunk.text_elements.last
34
58
 
59
+ # Resets the average character width when we see a change in font
60
+ # or a change in the font size
61
+ if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
62
+ previousAveCharWidth = -1;
63
+ end
64
+
35
65
  # if same char AND overlapped, skip
36
- if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
37
- chunks
66
+ if (prev_char.text == char.text) && prev_char.overlaps_with_ratio?(char, 0.5)
67
+ next chunks
68
+ end
69
+
70
+ # if char is a space that overlaps with the prev_char, skip
71
+ if char.text == ' ' && prev_char.left == char.left && prev_char.top == char.top
72
+ next chunks
73
+ end
74
+
75
+ # any vertical ruling goes across prev_char and char?
76
+ across_vertical_ruling = vertical_ruling_locations.any? { |loc|
77
+ prev_char.left < loc && char.left > loc
78
+ }
79
+
80
+ # Estimate the expected width of the space based on the
81
+ # space character with some margin.
82
+ wordSpacing = char.width_of_space
83
+ deltaSpace = 0
84
+ deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
85
+ ::Float::MAX
86
+ elsif lastWordSpacing < 0
87
+ wordSpacing * 0.5 # 0.5 == spacingTolerance
88
+ else
89
+ ((wordSpacing + lastWordSpacing) / 2.0) * 0.5
90
+ end
91
+
92
+ # Estimate the expected width of the space based on the
93
+ # average character width with some margin. This calculation does not
94
+ # make a true average (average of averages) but we found that it gave the
95
+ # best results after numerous experiments. Based on experiments we also found that
96
+ # .3 worked well.
97
+ averageCharWidth = if previousAveCharWidth < 0
98
+ char.width / char.text.size
99
+ else
100
+ (previousAveCharWidth + (char.width / char.text.size)) / 2.0
101
+ end
102
+ deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
103
+
104
+ # Compares the values obtained by the average method and the wordSpacing method and picks
105
+ # the smaller number.
106
+ expectedStartOfNextWordX = -::Float::MAX
107
+
108
+ if endOfLastTextX != -1
109
+ expectedStartOfNextWordX = endOfLastTextX + [deltaCharWidth, deltaSpace].min
110
+ end
111
+
112
+ sameLine = true
113
+ if !overlap(char.bottom, char.height, maxYForLine, maxHeightForLine)
114
+ endOfLastTextX = -1
115
+ expectedStartOfNextWordX = -::Float::MAX
116
+ maxYForLine = -::Float::MAX
117
+ maxHeightForLine = -1
118
+ minYTopForLine = ::Float::MAX
119
+ sameLine = false
120
+ end
121
+
122
+ endOfLastTextX = char.right
123
+ # should we add a space?
124
+ if !across_vertical_ruling \
125
+ && sameLine \
126
+ && expectedStartOfNextWordX < char.left \
127
+ && !prev_char.text.end_with?(' ')
128
+
129
+ sp = self.new(prev_char.top,
130
+ prev_char.right,
131
+ expectedStartOfNextWordX - prev_char.right,
132
+ prev_char.height,
133
+ prev_char.font,
134
+ prev_char.font_size,
135
+ ' ',
136
+ prev_char.width_of_space)
137
+ current_chunk << sp
38
138
  else
39
- # any vertical ruling goes across prev_char and char?
40
- across_vertical_ruling = vertical_ruling_locations.any? { |loc|
41
- prev_char.left < loc && char.left > loc
42
- }
43
-
44
- # should we add a space?
45
- if (prev_char.text != " ") && (char.text != " ") \
46
- && !across_vertical_ruling \
47
- && prev_char.should_add_space?(char)
48
-
49
- sp = self.new(prev_char.top,
50
- prev_char.right,
51
- prev_char.width_of_space,
52
- prev_char.width_of_space, # width == height for spaces
53
- prev_char.font,
54
- prev_char.font_size,
55
- ' ',
56
- prev_char.width_of_space)
57
- chunks.last << sp
58
- prev_char = sp
59
- end
60
-
61
- # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
62
- # that they ought to be merged by that account.
63
- # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
64
- # Why are both of those `.left`?, you might ask. The intuition is that a letter
65
- # that starts on the left of a vertical ruling ought to remain on the left of it.
66
- if !across_vertical_ruling && prev_char.should_merge?(char)
67
- chunks.last << char
68
- else
69
- # create a new chunk
70
- chunks << TextChunk.create_from_text_element(char)
71
- end
72
- chunks
139
+ sp = nil
73
140
  end
74
- end
75
- end
76
141
 
77
- # more or less returns True if distance < tolerance
78
- def should_merge?(other)
79
- raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
80
- self.vertically_overlaps?(other) && self.horizontal_distance(other) < width_of_space * (1 + TOLERANCE_FACTOR) && !self.should_add_space?(other)
81
- end
142
+ maxYForLine = [char.bottom, maxYForLine].max
143
+ maxHeightForLine = [maxHeightForLine, char.height].max
144
+ minYTopForLine = [minYTopForLine, char.top].min
145
+
146
+ # if sameLine
147
+ # puts "prev: #{prev_char.text} - char: #{char.text} - diff: #{char.left - prev_char.right} - space: #{[deltaCharWidth, deltaSpace].min} - spacing: #{wordSpacing} - sp: #{!sp.nil?}"
148
+ # else
149
+ # puts
150
+ # end
82
151
 
83
- # more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
84
- def should_add_space?(other)
85
- raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
86
152
 
87
- return false if self.width_of_space.nan?
153
+ dist = (char.left - (sp ? sp.right : prev_char.right))
88
154
 
89
- (self.vertically_overlaps?(other) &&
90
- self.horizontal_distance(other).abs.between?(self.width_of_space * (1 - TOLERANCE_FACTOR), self.width_of_space * (1 + TOLERANCE_FACTOR))) ||
91
- (self.vertical_distance(other) > self.height)
155
+ if !across_vertical_ruling \
156
+ && sameLine \
157
+ && (dist < 0 ? current_chunk.vertically_overlaps?(char) : dist < wordSpacing)
158
+ current_chunk << char
159
+ else
160
+ # create a new chunk
161
+ chunks << TextChunk.create_from_text_element(char)
162
+ end
163
+
164
+ lastWordSpacing = wordSpacing
165
+ previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
166
+
167
+ chunks
168
+ end
92
169
  end
93
170
 
94
171
  ##
95
172
  # merge this TextElement with another (adjust size and text content accordingly)
96
173
  def merge!(other)
97
174
  raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
98
- if self.horizontally_overlaps?(other) and other.top < self.top
175
+
176
+ if (self <=> other) < 0
99
177
  self.text = other.text + self.text
100
178
  else
101
179
  self.text << other.text
@@ -115,16 +193,5 @@ module Tabula
115
193
  self.text.strip == other.text.strip
116
194
  end
117
195
 
118
- # sort in lexicographic (reading) order
119
- def <=>(other)
120
- if self.vertically_overlaps?(other)
121
- self.left <=> other.left
122
- elsif self.top < other.top
123
- -1
124
- else
125
- 1
126
- end
127
- end
128
-
129
196
  end
130
197
  end
@@ -4,6 +4,7 @@ module Tabula
4
4
 
5
5
  class ZoneEntity < java.awt.geom.Rectangle2D::Float
6
6
 
7
+ # TODO used? remove if not.
7
8
  attr_accessor :texts
8
9
 
9
10
  def initialize(top, left, width, height)
@@ -11,6 +12,7 @@ module Tabula
11
12
  if left && top && width && height
12
13
  self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], left, top, width, height
13
14
  end
15
+ # TODO used? remove if not.
14
16
  self.texts = []
15
17
  end
16
18
 
@@ -21,18 +23,21 @@ module Tabula
21
23
  self.height = [self.bottom, other.bottom].max - top
22
24
 
23
25
  self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.left, self.top, self.width, self.height
26
+ self
24
27
  end
25
28
 
26
29
  ##
27
30
  # default sorting order for ZoneEntity objects
28
31
  # is lexicographical (left to right, top to bottom)
29
32
  def <=>(other)
30
- return 1 if self.left > other.left
31
- return -1 if self.left < other.left
32
- return 0 if self.vertically_overlaps?(other)
33
- return 1 if self.top > other.top
34
- return -1 if self.top < other.top
35
- return 0
33
+ yDifference = (self.bottom - other.bottom).abs
34
+ if yDifference < 0.1 ||
35
+ (other.bottom >= self.top && other.bottom <= self.bottom) ||
36
+ (self.bottom >= other.top && self.bottom <= other.bottom)
37
+ self.left <=> other.left
38
+ else
39
+ self.bottom <=> other.bottom
40
+ end
36
41
  end
37
42
 
38
43
  def to_json(options={})
@@ -47,6 +52,10 @@ module Tabula
47
52
  [top, left, bottom, right]
48
53
  end
49
54
 
55
+ def tlwh
56
+ [top, left, width, height]
57
+ end
58
+
50
59
  def points
51
60
  [ Point2D::Float.new(left, top),
52
61
  Point2D::Float.new(right, top),