tabula-extractor 0.7.2-java → 0.7.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
4
- data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
3
+ metadata.gz: 935de0f0dc43fa388a86cc091dc540b74b6ce31f
4
+ data.tar.gz: 67fa5fda6450c3b1659af3c61c8027843be5c082
5
5
  SHA512:
6
- metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
7
- data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4
6
+ metadata.gz: 191054f79148535bf359c81c72d35b717f71f97ee3c3bedd4c2af66e4332afb98f3071afe4c9ed9e894586e3a20722769742f17fc02b9a5d5d954a4fae50803d
7
+ data.tar.gz: 711f993194c402d1bca016f0fe13ccaeb8e4eafc6b67c2de0fa8b3cef1e7e3ae5b4cdefc2b251b64467747e7af26f80bb54bf57d4424ea50bb2dd26db7e27570
data/.gitignore CHANGED
@@ -12,6 +12,7 @@ rdoc
12
12
  spec/reports
13
13
  test/tmp
14
14
  test/version_tmp
15
+ test/data/icdar-groundtruth
15
16
  tmp
16
17
  /*.pdf
17
18
  /*.csv
data/README.md CHANGED
@@ -7,7 +7,7 @@ Extract tables from PDF files. `tabula-extractor` is the table extraction engine
7
7
 
8
8
  ## Installation
9
9
 
10
- At the moment, `tabula-extractor` only works with JRuby. [Install JRuby](http://jruby.org/getting-started) and run
10
+ `tabula-extractor` only works with JRuby 1.7 or newer. [Install JRuby](http://jruby.org/getting-started) and run
11
11
 
12
12
  ``
13
13
  jruby -S gem install tabula-extractor
@@ -57,12 +57,12 @@ Here's a very basic example:
57
57
 
58
58
  ````ruby
59
59
  require 'tabula'
60
-
60
+
61
61
  pdf_file_path = "whatever.pdf"
62
62
  outfilename = "whatever.csv"
63
-
63
+
64
64
  out = open(outfilename, 'w')
65
-
65
+
66
66
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
67
67
  extractor.extract.each do |pdf_page|
68
68
  pdf_page.spreadsheets.each do |spreadsheet|
@@ -73,7 +73,3 @@ end
73
73
  out.close
74
74
 
75
75
  ````
76
-
77
- ## Notes
78
-
79
- `tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
data/bin/tabula CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env jruby
1
+ #!/usr/bin/env jruby -J-Djava.awt.headless=true
2
2
  # encoding: utf-8
3
3
  require 'trollop'
4
4
  require_relative '../lib/tabula'
@@ -9,7 +9,7 @@ def parse_pages_arg(pages_arg)
9
9
  if(pages_arg == 'all')
10
10
  return :all
11
11
  end
12
-
12
+
13
13
  ranges = pages_arg.split(',').map(&:strip)
14
14
  pages = []
15
15
  ranges.each do |range|
@@ -100,7 +100,7 @@ def main
100
100
  else
101
101
  false
102
102
  end
103
-
103
+
104
104
  extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
105
105
  extractor.extract.each_with_index do |pdf_page, page_index|
106
106
 
data/lib/tabula.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  module Tabula
2
2
  PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
3
3
  ONLY_SPACES_RE = Regexp.new('^\s+$')
4
+ SAME_CHAR_RE = Regexp.new('^(.)\1+$')
4
5
  end
5
6
 
6
7
  require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
@@ -8,7 +9,6 @@ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
8
9
  require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
9
10
  require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
10
11
 
11
-
12
12
  import 'java.util.logging.LogManager'
13
13
  import 'java.util.logging.Level'
14
14
 
@@ -22,13 +22,17 @@ lm.logger_names.each do |name|
22
22
  end
23
23
  end
24
24
  end
25
-
26
-
27
25
  require_relative './tabula/version'
28
26
  require_relative './tabula/core_ext'
27
+
29
28
  require_relative './tabula/entities'
30
29
  require_relative './tabula/extraction'
31
30
  require_relative './tabula/table_extractor'
32
31
  require_relative './tabula/writers'
33
- require_relative './tabula/line_segment_detector'
34
- require_relative './tabula/pdf_render'
32
+
33
+ module Tabula
34
+ autoload :LSD , File.expand_path('tabula/line_segment_detector.rb', File.dirname(__FILE__))
35
+ autoload :Render , File.expand_path('tabula/pdf_render.rb', File.dirname(__FILE__))
36
+ end
37
+
38
+ require_relative './tabula/table_extractor'
@@ -1,3 +1,4 @@
1
+ require_relative './entities/tabular'
1
2
  require_relative './entities/zone_entity'
2
3
  require_relative './entities/cell'
3
4
  require_relative './entities/has_cells'
@@ -15,7 +15,7 @@ module Tabula
15
15
  @placeholder = false
16
16
  @spanning = false
17
17
  @text_elements = []
18
- @options = ({:use_line_returns => false, :cell_debug => NORMAL}).merge options
18
+ @options = ({:use_line_returns => true, :cell_debug => NORMAL}).merge options
19
19
  end
20
20
 
21
21
  def self.new_from_points(topleft, bottomright, options={})
@@ -29,11 +29,13 @@ module Tabula
29
29
  output = ""
30
30
  text_elements.sort #use the default sort for ZoneEntity
31
31
  text_elements.group_by(&:top).values.each do |row|
32
- output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\n" : '')
33
- end
32
+ output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\r" : '')
33
+ # per @bchartoff, https://github.com/jazzido/tabula-extractor/pull/65#issuecomment-32899336
34
+ # line returns as \r behave better in Excel.
35
+ end
34
36
  if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
35
37
  text_output = output.dup
36
- output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
38
+ output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
37
39
  output += " \n #{text_output}"
38
40
  end
39
41
  output.strip
@@ -6,27 +6,30 @@ module Tabula
6
6
  # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors; ruling_lines reader
7
7
  module HasCells
8
8
 
9
- ANOTHER_MAGIC_NUMBER = 0.75
9
+ ARBITRARY_MAGIC_HEURISTIC_NUMBER = 0.65
10
10
 
11
11
  def is_tabular?
12
+ ratio = heuristic_ratio
13
+ return ratio > ARBITRARY_MAGIC_HEURISTIC_NUMBER && ratio < (1 / ARBITRARY_MAGIC_HEURISTIC_NUMBER)
14
+ end
15
+
16
+ def heuristic_ratio
12
17
  #spreadsheet extraction
13
18
  spreadsheet = spreadsheets.first
14
- return false if spreadsheet.nil?
19
+ return Float::NAN if spreadsheet.nil?
15
20
  rows_defined_by_lines = spreadsheet.rows.size #rows filled in automatically
16
21
  columns_defined_by_lines = spreadsheet.cols.size
17
22
 
18
23
  table = self.get_table
19
24
  columns_defined_without_lines = table.cols.size
20
25
  rows_defined_without_lines = table.rows.size
21
- ratio = ((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
22
-
23
- return ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
26
+ ((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
24
27
  end
25
28
 
26
29
  # finds cells from the ruling lines on the page.
27
30
  # implements Nurminen thesis algorithm cf. https://github.com/jazzido/tabula-extractor/issues/16
28
31
  # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
29
- def find_cells!(options={})
32
+ def find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options={})
30
33
  # All lines need to been sorted from up to down,
31
34
  # and left to right in ascending order
32
35
 
@@ -39,9 +42,10 @@ module Tabula
39
42
  # depending on the Point2D default sort here.
40
43
  intersection_points_array = intersection_points.keys.sort
41
44
 
42
- intersection_points.each_with_index do |(topLeft, ((horizontal, vertical))), i|
45
+ intersection_points_array.each_with_index do |topLeft, i|
43
46
  # Fetch all points on the same vertical and horizontal
44
47
  # line with current crossing point
48
+ horizontal, vertical = intersection_points[topLeft]
45
49
 
46
50
  # this lets us go to the next intersection_point in intersection_points_array
47
51
  # it is bad and I feel bad.
@@ -64,19 +68,19 @@ module Tabula
64
68
  # point;
65
69
  next unless horizontal.colinear?(y_point)
66
70
  #Hypothetical bottom right point of rectangle
67
- btmRight = Point2D::Float.new( y_point.x, x_point.y )
71
+ btmRight = Point2D::Float.new(y_point.x, x_point.y)
68
72
  if intersection_points.include?(btmRight)
69
- intersection_points[btmRight].each do |btmRightHorizontal, btmRightVertical|
70
- if btmRightHorizontal.colinear?( x_point ) &&
73
+ btmRightHorizontal, btmRightVertical = intersection_points[btmRight]
74
+
75
+ if btmRightHorizontal.colinear?( x_point ) &&
71
76
  btmRightVertical.colinear?( y_point )
72
- # Rectangle is confirmed to have 4 sides
73
- cellsFound << Cell.new_from_points( topLeft, btmRight, options)
74
- # Each crossing point can be the top left corner
75
- # of only a single rectangle
76
- #next crossing-point; we need to "next" out of the outer loop here
77
- # to avoid creating non-minimal cells, I htink.
78
- throw :cellCreated
79
- end
77
+ # Rectangle is confirmed to have 4 sides
78
+ cellsFound << Cell.new_from_points( topLeft, btmRight, options)
79
+ # Each crossing point can be the top left corner
80
+ # of only a single rectangle
81
+ #next crossing-point; we need to "next" out of the outer loop here
82
+ # to avoid creating non-minimal cells, I htink.
83
+ throw :cellCreated
80
84
  end
81
85
  end
82
86
  end
@@ -87,66 +91,6 @@ module Tabula
87
91
  cellsFound
88
92
  end
89
93
 
90
- #############################
91
- # Chapter 2, Spanning Cells #
92
- #############################
93
- #if c is a "spanning cell", that is
94
- # if there are N>0 vertical lines strictly between this cell's left and right
95
- #insert N placeholder cells after it with zero size (but same top)
96
-
97
- # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
98
- def add_spanning_cells!
99
- #rounding: because Cell.new_from_points, using in #find_cells above, has
100
- # a float precision error where, for instance, a cell whose x2 coord is
101
- # supposed to be 160.137451171875 comes out as 160.13745498657227 because
102
- # of minus. :(
103
- vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
104
- horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
105
-
106
- cells.each do |c|
107
- vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
108
- horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
109
-
110
- unless vertical_rulings_spanned_over.empty?
111
- c.spanning = true
112
- vertical_rulings_spanned_over.each do |spanned_over_line_loc|
113
- placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
114
- placeholder.placeholder = true
115
- cells << placeholder
116
- end
117
- end
118
- unless horizontal_rulings_spanned_over.empty?
119
- c.spanning = true
120
- horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
121
- placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
122
- placeholder.placeholder = true
123
- cells << placeholder
124
- end
125
- end
126
-
127
- #if there's a spanning cell that's spans over both rows and columns, then it has "double placeholder" cells
128
- # e.g. -------------------
129
- # | C | C | C | C | (this is some pretty sweet ASCII art, eh?)
130
- # |-----------------|
131
- # | C | C | C | C |
132
- # |-----------------|
133
- # | C | SC P | C | where MC is the "spanning cell" that holds all the text within its bounds
134
- # |---- + ----| P is a "placeholder" cell with either zero width or zero height
135
- # | C | P DP | C | DP is a "double placeholder" cell with zero width and zero height
136
- # |---- + ----| C is an ordinary cell.
137
- # | C | P DP | C |
138
- # |-----------------|
139
-
140
- unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
141
- double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
142
- placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
143
- placeholder.placeholder = true
144
- cells << placeholder
145
- end
146
- end
147
- end
148
- end
149
-
150
94
  #TODO:
151
95
  #returns array of Spreadsheet objects constructed (or spreadsheet_areas => cells)
152
96
  #maybe placeholders should be added after cells is split into spreadsheets
@@ -3,6 +3,8 @@ module Tabula
3
3
  attr_accessor :text_elements
4
4
  attr_reader :index
5
5
 
6
+ SPACE_RUN_MAX_LENGTH = 3
7
+
6
8
  def initialize(index=nil)
7
9
  @text_elements = []
8
10
  @index = index
@@ -16,15 +18,59 @@ module Tabula
16
18
  self.width = t.width
17
19
  self.height = t.height
18
20
  else
19
- if in_same_column = @text_elements.find { |te| te.horizontally_overlaps?(t) }
20
- in_same_column.merge!(t)
21
- else
22
- self.text_elements << t
23
- self.merge!(t)
24
- end
21
+ self.text_elements << t
22
+ self.merge!(t)
25
23
  end
26
24
  end
27
25
 
26
+ ##
27
+ # remove runs of the space char longer than SPACE_RUN_MAX_LENGTH
28
+ # should not change dimensions of the container +Line+
29
+ def remove_sequential_spaces!(seq_spaces_count=SPACE_RUN_MAX_LENGTH)
30
+ self.text_elements = self.text_elements.reduce([]) do |memo, text_chunk|
31
+ long_space_runs = text_chunk
32
+ .text_elements
33
+ .chunk { |te| te.text == ' '} # detect runs of spaces...
34
+ .select { |is_space, text_elements| # ...longer than SPACE_RUN_MAX_LENGTH
35
+ is_space && !text_elements.nil? && text_elements.size >= SPACE_RUN_MAX_LENGTH
36
+ }
37
+ .map { |_, text_elements| text_elements }
38
+
39
+ # no long runs of spaces
40
+ # keep as it was and end iteration
41
+ if long_space_runs.empty?
42
+ memo << text_chunk
43
+ next memo
44
+ end
45
+
46
+ ranges = long_space_runs.map { |lsr|
47
+ idx = text_chunk
48
+ .text_elements
49
+ .index { |te| te.equal?(lsr.first) } # we need pointer comparison here
50
+ (idx)..(idx+lsr.size-1)
51
+ }
52
+
53
+ in_run = false
54
+ new_chunk = true
55
+ text_chunk
56
+ .text_elements
57
+ .each_with_index do |te, i|
58
+ if ranges.any? { |r| r.include?(i) } # te belongs to a run of spaces, skip
59
+ in_run = true
60
+ else
61
+ if in_run || new_chunk
62
+ memo << TextChunk.create_from_text_element(te)
63
+ else
64
+ memo.last << te
65
+ end
66
+ in_run = new_chunk = false
67
+ end
68
+ end
69
+ memo
70
+ end # reduce
71
+ self
72
+ end
73
+
28
74
  #used for testing, ignores text element stuff besides stripped text.
29
75
  def ==(other)
30
76
  return false if other.nil?
@@ -6,7 +6,7 @@ module Tabula
6
6
  attr_writer :min_char_width, :min_char_height
7
7
  attr_accessor :cells
8
8
 
9
- def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
9
+ def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
10
10
  super(0, 0, width, height)
11
11
  @rotation = rotation
12
12
  if number < 1
@@ -19,10 +19,16 @@ module Tabula
19
19
  @spreadsheets = nil
20
20
  @min_char_width = min_char_width
21
21
  @min_char_height = min_char_height
22
- @spatial_index = TextElementIndex.new
23
22
 
24
23
  self.texts = texts
25
- self.texts.each { |te| @spatial_index << te }
24
+
25
+ if spatial_index.nil?
26
+ @spatial_index = TextElementIndex.new
27
+ self.texts.each { |te| @spatial_index << te }
28
+ else
29
+ @spatial_index = spatial_index
30
+ end
31
+
26
32
  end
27
33
 
28
34
  def min_char_width
@@ -49,7 +55,8 @@ module Tabula
49
55
  texts,
50
56
  Ruling.crop_rulings_to_area(@ruling_lines, area),
51
57
  texts.map(&:width).min,
52
- texts.map(&:height).min)
58
+ texts.map(&:height).min,
59
+ @spatial_index)
53
60
  return page_area
54
61
  end
55
62
 
@@ -60,28 +67,33 @@ module Tabula
60
67
  return Tabula::Table.new(0, [])
61
68
  end
62
69
 
63
- text_chunks = TextElement.merge_words(self.texts.sort, options).sort
70
+ texts = self.texts.sort
71
+ text_chunks = TextElement.merge_words(texts, options)
64
72
 
65
- lines = TextChunk.group_by_lines(text_chunks)
73
+ lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)
66
74
 
67
- unless options[:vertical_rulings].empty?
68
- columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
69
- separators = columns.sort.reverse
70
- else
71
- columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
72
- text_chunks)
73
- separators = columns[1..-1].sort.reverse
74
- end
75
+ columns = unless options[:vertical_rulings].empty?
76
+ options[:vertical_rulings].map(&:left).sort #pixel locations, not entities
77
+ else
78
+ TextChunk.column_positions(lines).sort
79
+ end
75
80
 
76
- table = Table.new(lines.count, separators)
81
+ table = Table.new(lines.count, columns)
77
82
  lines.each_with_index do |line, i|
78
- line.text_elements.each do |te|
79
- j = separators.find_index { |s| te.left > s } || separators.count
80
- table.add_text_element(te, i, separators.count - j)
83
+ line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
84
+ j = columns.find_index { |s| te.left <= s } || columns.count
85
+ table.add_text_element(te, i, j)
81
86
  end
82
87
  end
83
88
 
84
- table.lstrip_lines!
89
+ # fixes up the table a little bit, replacing nils with empty TextElements
90
+ # and sorting the lines.
91
+ # table.rows.each do |l|
92
+ # l.text_elements = l.text_elements.map do |te|
93
+ # te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
94
+ # end
95
+ # end
96
+ # table.rows.sort_by!(&:top)
85
97
  table
86
98
  end
87
99
 
@@ -96,7 +108,7 @@ module Tabula
96
108
  return @spreadsheets
97
109
  end
98
110
  get_ruling_lines!(options)
99
- self.find_cells!(options)
111
+ self.find_cells!(self.horizontal_ruling_lines, self.vertical_ruling_lines, options)
100
112
 
101
113
  spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.
102
114
 
@@ -157,14 +169,18 @@ module Tabula
157
169
 
158
170
  #returns ruling lines, memoizes them in
159
171
  def get_ruling_lines!(options={})
160
- if !@ruling_lines.nil? && !@ruling_lines.empty?
161
- self.snap_points!
162
- @vertical_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
163
- @horizontal_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
164
- @vertical_ruling_lines + @horizontal_ruling_lines
165
- else
166
- []
172
+ if @ruling_lines.nil? || @ruling_lines.empty?
173
+ return []
167
174
  end
175
+ self.snap_points!
176
+
177
+ @ruling_lines.select! { |l| !(l.width == 0 && l.height == 0) }
178
+
179
+ @vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
180
+ @horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
181
+
182
+ @vertical_ruling_lines + @horizontal_ruling_lines
183
+
168
184
  end
169
185
 
170
186
  ##
@@ -252,29 +268,6 @@ module Tabula
252
268
  l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
253
269
  end
254
270
  end
255
-
256
- def collapse_oriented_rulings(lines)
257
- # lines must all be of one orientation (i.e. horizontal, vertical)
258
-
259
- if lines.empty?
260
- return []
261
- end
262
-
263
- lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
264
-
265
- lines = lines.inject([lines.shift]) do |memo, next_line|
266
- last = memo.last
267
- if next_line.position == last.position && last.nearlyIntersects?(next_line)
268
- memo.last.start = next_line.start < last.start ? next_line.start : last.start
269
- memo.last.end = next_line.end < last.end ? last.end : next_line.end
270
- memo
271
- elsif next_line.length == 0
272
- memo
273
- else
274
- memo << next_line
275
- end
276
- end
277
- end
278
271
  end
279
272
 
280
273
  end