tabula-extractor 0.7.2-java → 0.7.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
@@ -0,0 +1,42 @@
1
+ module Tabula
2
+ module AbstractInterface
3
+
4
+ class InterfaceNotImplementedError < NoMethodError
5
+ end
6
+
7
+ def self.included(klass)
8
+ klass.send(:include, AbstractInterface::Methods)
9
+ klass.send(:extend, AbstractInterface::Methods)
10
+ end
11
+
12
+ module Methods
13
+ def api_not_implemented(klass)
14
+ caller.first.match(/in \`(.+)\'/)
15
+ method_name = $1
16
+ raise AbstractInterface::InterfaceNotImplementedError.new("#{klass.class.name} needs to implement '#{method_name}' for interface #{self.name}!")
17
+ end
18
+ end
19
+ end
20
+
21
+
22
+ module Tabular
23
+ include AbstractInterface
24
+ # this is a pseudo-interface as described here:
25
+ # http://metabates.com/2011/02/07/building-interfaces-and-abstract-classes-in-ruby/
26
+ # Table and Spreadsheet implement this interface, and so should any class
27
+ # intended to represent tabular data from a PDF (e.g. if another extraction
28
+ # method were created), so that Tabula GUI and API can correctly handle
29
+ # its data.
30
+
31
+ def extraction_method; raise Tabular.api_not_implemented(self); end
32
+
33
+ def page; Tabular.api_not_implemented(self); end
34
+ def rows; Tabular.api_not_implemented(self); end
35
+ def cols; Tabular.api_not_implemented(self); end
36
+
37
+ def to_csv; Tabular.api_not_implemented(self); end
38
+ def to_tsv; Tabular.api_not_implemented(self); end
39
+ def to_a; Tabular.api_not_implemented(self); end
40
+ def to_json; Tabular.api_not_implemented(self); end
41
+ end
42
+ end
@@ -8,46 +8,71 @@ module Tabula
8
8
  # initialize a new TextChunk from a TextElement
9
9
  def self.create_from_text_element(text_element)
10
10
  raise TypeError, "argument is not a TextElement" unless text_element.instance_of?(TextElement)
11
- tc = self.new(text_element.top, text_element.left, text_element.width, text_element.height)
11
+ tc = self.new(*text_element.tlwh)
12
12
  tc.text_elements = [text_element]
13
13
  return tc
14
14
  end
15
15
 
16
- ##
17
- # group an iterable of TextChunk into a list of Line
18
16
  def self.group_by_lines(text_chunks)
19
- lines = []
20
- text_chunks.each do |te|
21
- next if te.text =~ ONLY_SPACES_RE
22
- l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
23
- if l.nil?
24
- l = Line.new
25
- lines << l
17
+ bbwidth = text_chunks.max_by(&:right).right - text_chunks.min_by(&:left).left
18
+
19
+ l = Line.new
20
+ l << text_chunks.first
21
+
22
+ lines = text_chunks[1..-1].inject([l]) do |lines, te|
23
+ if lines.last.horizontal_overlap_ratio(te) < 0.01
24
+ # skip lines that:
24
+ # - are wider than 90% of the width of the text_chunks bounding box
25
+ # - contain a single repeated character
27
+ if lines.last.width / bbwidth > 0.9 \
28
+ && l.text_elements.all? { |te| te.text =~ SAME_CHAR_RE }
29
+ lines.pop
30
+ end
31
+ lines << Line.new
26
32
  end
27
- l << te
33
+ lines.last << te
34
+ lines
28
35
  end
29
- lines
36
+
37
+ if lines.last.width / bbwidth > 0.9 \
38
+ && l.text_elements.all? { |te| te.text =~ SAME_CHAR_RE }
39
+ lines.pop
40
+ end
41
+
42
+ lines.map!(&:remove_sequential_spaces!)
30
43
  end
31
44
 
32
45
  ##
33
- # calculate estimated columns from an iterable of TextChunk
34
- def self.column_positions(top, text_chunks)
35
- right = 0
36
- columns = []
37
-
38
- text_chunks.each do |te|
39
- next if te.text =~ ONLY_SPACES_RE
40
- if te.top >= top
41
- left = te.left
42
- if (left > right)
43
- columns << right
44
- right = te.right
45
- elsif te.right > right
46
- right = te.right
46
+ # returns a list of column boundaries (x axis)
47
+ # +lines+ must be an array of lines sorted by their +top+ attribute
48
+ def self.column_positions(lines)
49
+ init = lines.first.text_elements.inject([]) { |memo, text_chunk|
50
+ next memo if text_chunk.text =~ ONLY_SPACES_RE
51
+ memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)
52
+ memo
53
+ }
54
+
55
+ regions = lines[1..-1]
56
+ .inject(init) do |column_regions, line|
57
+
58
+ line_text_elements = line.text_elements.clone.select { |te| te.text !~ ONLY_SPACES_RE }
59
+
60
+ column_regions.each do |cr|
61
+
62
+ overlaps = line_text_elements
63
+ .select { |te| te.text !~ ONLY_SPACES_RE && cr.horizontally_overlaps?(te) }
64
+
65
+ overlaps.inject(cr) do |memo, te|
66
+ cr.merge!(te)
47
67
  end
68
+
69
+ line_text_elements = line_text_elements - overlaps
48
70
  end
71
+
72
+ column_regions += line_text_elements.map { |te| Tabula::ZoneEntity.new(*te.tlwh) }
49
73
  end
50
- columns
74
+
75
+ regions.map { |r| r.right.round(2) }.uniq
51
76
  end
52
77
 
53
78
  ##
@@ -59,10 +84,10 @@ module Tabula
59
84
 
60
85
  def merge!(other)
61
86
  if other.instance_of?(TextChunk)
62
- if self.horizontally_overlaps?(other) && other.top < self.top
63
- self.text_elements = other.text_elements + self.text_elements
64
- else
87
+ if (self <=> other) < 0
65
88
  self.text_elements = self.text_elements + other.text_elements
89
+ else
90
+ self.text_elements = other.text_elements + self.text_elements
66
91
  end
67
92
  end
68
93
  super(other)
@@ -75,28 +100,6 @@ module Tabula
75
100
  raise "Not Implemented"
76
101
  end
77
102
 
78
- ##
79
- # remove leading and trailing whitespace
80
- # (changes geometry accordingly)
81
- # TODO horrible implementation - fix.
82
- def strip!
83
- acc = 0
84
- new_te = self.text_elements.drop_while { |te|
85
- te.text == ' ' && acc += 1
86
- }
87
- self.left += self.text_elements.take(acc).inject(0) { |m, te| m += te.width }
88
- self.text_elements = new_te
89
-
90
- self.text_elements.reverse!
91
- acc = 0
92
- new_te = self.text_elements.drop_while { |te|
93
- te.text == ' ' && acc += 1
94
- }
95
- self.right -= self.text_elements.take(acc).inject(0) { |m, te| m += te.width }
96
- self.text_elements = new_te.reverse
97
- self
98
- end
99
-
100
103
  def text
101
104
  self.text_elements.map(&:text).join
102
105
  end
@@ -1,4 +1,6 @@
1
+ # -*- coding: utf-8 -*-
1
2
  module Tabula
3
+
2
4
  ##
3
5
  # a Glyph
4
6
  class TextElement < ZoneEntity
@@ -17,8 +19,20 @@ module Tabula
17
19
 
18
20
  EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
19
21
 
22
+ def self.within(first, second, variance )
23
+ second < first + variance && second > first - variance
24
+ end
25
+
26
+ def self.overlap(y1, height1, y2, height2, variance=0.1)
27
+ within( y1, y2, variance) || (y2 <= y1 && y2 >= y1 - height1) \
28
+ || (y1 <= y2 && y1 >= y2-height2)
29
+ end
30
+
31
+
20
32
  ##
21
33
  # heuristically merge an iterable of TextElement into a list of TextChunk
34
+ # lots of ideas taken from PDFBox's PDFTextStripper.writePage
35
+ # here be dragons
22
36
  def self.merge_words(text_elements, options={})
23
37
  default_options = {:vertical_rulings => []}
24
38
  options = default_options.merge(options)
@@ -28,74 +42,138 @@ module Tabula
28
42
 
29
43
  text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
30
44
 
45
+
46
+ previousAveCharWidth = text_chunks.first.width
47
+ endOfLastTextX = text_chunks.first.right
48
+ maxYForLine = text_chunks.first.bottom
49
+ maxHeightForLine = text_chunks.first.height
50
+ minYTopForLine = text_chunks.first.top
51
+ lastWordSpacing = -1
52
+ sp = nil
53
+
31
54
  text_elements.inject(text_chunks) do |chunks, char|
55
+
32
56
  current_chunk = chunks.last
33
57
  prev_char = current_chunk.text_elements.last
34
58
 
59
+ # Resets the average character width when we see a change in font
60
+ # or a change in the font size
61
+ if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
62
+ previousAveCharWidth = -1;
63
+ end
64
+
35
65
  # if same char AND overlapped, skip
36
- if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
37
- chunks
66
+ if (prev_char.text == char.text) && prev_char.overlaps_with_ratio?(char, 0.5)
67
+ next chunks
68
+ end
69
+
70
+ # if char is a space that overlaps with the prev_char, skip
71
+ if char.text == ' ' && prev_char.left == char.left && prev_char.top == char.top
72
+ next chunks
73
+ end
74
+
75
+ # any vertical ruling goes across prev_char and char?
76
+ across_vertical_ruling = vertical_ruling_locations.any? { |loc|
77
+ prev_char.left < loc && char.left > loc
78
+ }
79
+
80
+ # Estimate the expected width of the space based on the
81
+ # space character with some margin.
82
+ wordSpacing = char.width_of_space
83
+ deltaSpace = 0
84
+ deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
85
+ ::Float::MAX
86
+ elsif lastWordSpacing < 0
87
+ wordSpacing * 0.5 # 0.5 == spacingTolerance
88
+ else
89
+ ((wordSpacing + lastWordSpacing) / 2.0) * 0.5
90
+ end
91
+
92
+ # Estimate the expected width of the space based on the
93
+ # average character width with some margin. This calculation does not
94
+ # make a true average (average of averages) but we found that it gave the
95
+ # best results after numerous experiments. Based on experiments we also found that
96
+ # .3 worked well.
97
+ averageCharWidth = if previousAveCharWidth < 0
98
+ char.width / char.text.size
99
+ else
100
+ (previousAveCharWidth + (char.width / char.text.size)) / 2.0
101
+ end
102
+ deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
103
+
104
+ # Compares the values obtained by the average method and the wordSpacing method and picks
105
+ # the smaller number.
106
+ expectedStartOfNextWordX = -::Float::MAX
107
+
108
+ if endOfLastTextX != -1
109
+ expectedStartOfNextWordX = endOfLastTextX + [deltaCharWidth, deltaSpace].min
110
+ end
111
+
112
+ sameLine = true
113
+ if !overlap(char.bottom, char.height, maxYForLine, maxHeightForLine)
114
+ endOfLastTextX = -1
115
+ expectedStartOfNextWordX = -::Float::MAX
116
+ maxYForLine = -::Float::MAX
117
+ maxHeightForLine = -1
118
+ minYTopForLine = ::Float::MAX
119
+ sameLine = false
120
+ end
121
+
122
+ endOfLastTextX = char.right
123
+ # should we add a space?
124
+ if !across_vertical_ruling \
125
+ && sameLine \
126
+ && expectedStartOfNextWordX < char.left \
127
+ && !prev_char.text.end_with?(' ')
128
+
129
+ sp = self.new(prev_char.top,
130
+ prev_char.right,
131
+ expectedStartOfNextWordX - prev_char.right,
132
+ prev_char.height,
133
+ prev_char.font,
134
+ prev_char.font_size,
135
+ ' ',
136
+ prev_char.width_of_space)
137
+ current_chunk << sp
38
138
  else
39
- # any vertical ruling goes across prev_char and char?
40
- across_vertical_ruling = vertical_ruling_locations.any? { |loc|
41
- prev_char.left < loc && char.left > loc
42
- }
43
-
44
- # should we add a space?
45
- if (prev_char.text != " ") && (char.text != " ") \
46
- && !across_vertical_ruling \
47
- && prev_char.should_add_space?(char)
48
-
49
- sp = self.new(prev_char.top,
50
- prev_char.right,
51
- prev_char.width_of_space,
52
- prev_char.width_of_space, # width == height for spaces
53
- prev_char.font,
54
- prev_char.font_size,
55
- ' ',
56
- prev_char.width_of_space)
57
- chunks.last << sp
58
- prev_char = sp
59
- end
60
-
61
- # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
62
- # that they ought to be merged by that account.
63
- # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
64
- # Why are both of those `.left`?, you might ask. The intuition is that a letter
65
- # that starts on the left of a vertical ruling ought to remain on the left of it.
66
- if !across_vertical_ruling && prev_char.should_merge?(char)
67
- chunks.last << char
68
- else
69
- # create a new chunk
70
- chunks << TextChunk.create_from_text_element(char)
71
- end
72
- chunks
139
+ sp = nil
73
140
  end
74
- end
75
- end
76
141
 
77
- # more or less returns True if distance < tolerance
78
- def should_merge?(other)
79
- raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
80
- self.vertically_overlaps?(other) && self.horizontal_distance(other) < width_of_space * (1 + TOLERANCE_FACTOR) && !self.should_add_space?(other)
81
- end
142
+ maxYForLine = [char.bottom, maxYForLine].max
143
+ maxHeightForLine = [maxHeightForLine, char.height].max
144
+ minYTopForLine = [minYTopForLine, char.top].min
145
+
146
+ # if sameLine
147
+ # puts "prev: #{prev_char.text} - char: #{char.text} - diff: #{char.left - prev_char.right} - space: #{[deltaCharWidth, deltaSpace].min} - spacing: #{wordSpacing} - sp: #{!sp.nil?}"
148
+ # else
149
+ # puts
150
+ # end
82
151
 
83
- # more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
84
- def should_add_space?(other)
85
- raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
86
152
 
87
- return false if self.width_of_space.nan?
153
+ dist = (char.left - (sp ? sp.right : prev_char.right))
88
154
 
89
- (self.vertically_overlaps?(other) &&
90
- self.horizontal_distance(other).abs.between?(self.width_of_space * (1 - TOLERANCE_FACTOR), self.width_of_space * (1 + TOLERANCE_FACTOR))) ||
91
- (self.vertical_distance(other) > self.height)
155
+ if !across_vertical_ruling \
156
+ && sameLine \
157
+ && (dist < 0 ? current_chunk.vertically_overlaps?(char) : dist < wordSpacing)
158
+ current_chunk << char
159
+ else
160
+ # create a new chunk
161
+ chunks << TextChunk.create_from_text_element(char)
162
+ end
163
+
164
+ lastWordSpacing = wordSpacing
165
+ previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
166
+
167
+ chunks
168
+ end
92
169
  end
93
170
 
94
171
  ##
95
172
  # merge this TextElement with another (adjust size and text content accordingly)
96
173
  def merge!(other)
97
174
  raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
98
- if self.horizontally_overlaps?(other) and other.top < self.top
175
+
176
+ if (self <=> other) < 0
99
177
  self.text = other.text + self.text
100
178
  else
101
179
  self.text << other.text
@@ -115,16 +193,5 @@ module Tabula
115
193
  self.text.strip == other.text.strip
116
194
  end
117
195
 
118
- # sort in lexicographic (reading) order
119
- def <=>(other)
120
- if self.vertically_overlaps?(other)
121
- self.left <=> other.left
122
- elsif self.top < other.top
123
- -1
124
- else
125
- 1
126
- end
127
- end
128
-
129
196
  end
130
197
  end
@@ -4,6 +4,7 @@ module Tabula
4
4
 
5
5
  class ZoneEntity < java.awt.geom.Rectangle2D::Float
6
6
 
7
+ # TODO used? remove if not.
7
8
  attr_accessor :texts
8
9
 
9
10
  def initialize(top, left, width, height)
@@ -11,6 +12,7 @@ module Tabula
11
12
  if left && top && width && height
12
13
  self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], left, top, width, height
13
14
  end
15
+ # TODO used? remove if not.
14
16
  self.texts = []
15
17
  end
16
18
 
@@ -21,18 +23,21 @@ module Tabula
21
23
  self.height = [self.bottom, other.bottom].max - top
22
24
 
23
25
  self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.left, self.top, self.width, self.height
26
+ self
24
27
  end
25
28
 
26
29
  ##
27
30
  # default sorting order for ZoneEntity objects
28
31
  # is lexicographical (left to right, top to bottom)
29
32
  def <=>(other)
30
- return 1 if self.left > other.left
31
- return -1 if self.left < other.left
32
- return 0 if self.vertically_overlaps?(other)
33
- return 1 if self.top > other.top
34
- return -1 if self.top < other.top
35
- return 0
33
+ yDifference = (self.bottom - other.bottom).abs
34
+ if yDifference < 0.1 ||
35
+ (other.bottom >= self.top && other.bottom <= self.bottom) ||
36
+ (self.bottom >= other.top && self.bottom <= other.bottom)
37
+ self.left <=> other.left
38
+ else
39
+ self.bottom <=> other.bottom
40
+ end
36
41
  end
37
42
 
38
43
  def to_json(options={})
@@ -47,6 +52,10 @@ module Tabula
47
52
  [top, left, bottom, right]
48
53
  end
49
54
 
55
+ def tlwh
56
+ [top, left, width, height]
57
+ end
58
+
50
59
  def points
51
60
  [ Point2D::Float.new(left, top),
52
61
  Point2D::Float.new(right, top),