tabula-extractor 0.7.2-java → 0.7.4-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
4
- data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
3
+ metadata.gz: 935de0f0dc43fa388a86cc091dc540b74b6ce31f
4
+ data.tar.gz: 67fa5fda6450c3b1659af3c61c8027843be5c082
5
5
  SHA512:
6
- metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
7
- data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4
6
+ metadata.gz: 191054f79148535bf359c81c72d35b717f71f97ee3c3bedd4c2af66e4332afb98f3071afe4c9ed9e894586e3a20722769742f17fc02b9a5d5d954a4fae50803d
7
+ data.tar.gz: 711f993194c402d1bca016f0fe13ccaeb8e4eafc6b67c2de0fa8b3cef1e7e3ae5b4cdefc2b251b64467747e7af26f80bb54bf57d4424ea50bb2dd26db7e27570
data/.gitignore CHANGED
@@ -12,6 +12,7 @@ rdoc
12
12
  spec/reports
13
13
  test/tmp
14
14
  test/version_tmp
15
+ test/data/icdar-groundtruth
15
16
  tmp
16
17
  /*.pdf
17
18
  /*.csv
data/README.md CHANGED
@@ -7,7 +7,7 @@ Extract tables from PDF files. `tabula-extractor` is the table extraction engine
7
7
 
8
8
  ## Installation
9
9
 
10
- At the moment, `tabula-extractor` only works with JRuby. [Install JRuby](http://jruby.org/getting-started) and run
10
+ `tabula-extractor` only works with JRuby 1.7 or newer. [Install JRuby](http://jruby.org/getting-started) and run
11
11
 
12
12
  ``
13
13
  jruby -S gem install tabula-extractor
@@ -57,12 +57,12 @@ Here's a very basic example:
57
57
 
58
58
  ````ruby
59
59
  require 'tabula'
60
-
60
+
61
61
  pdf_file_path = "whatever.pdf"
62
62
  outfilename = "whatever.csv"
63
-
63
+
64
64
  out = open(outfilename, 'w')
65
-
65
+
66
66
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
67
67
  extractor.extract.each do |pdf_page|
68
68
  pdf_page.spreadsheets.each do |spreadsheet|
@@ -73,7 +73,3 @@ end
73
73
  out.close
74
74
 
75
75
  ````
76
-
77
- ## Notes
78
-
79
- `tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
data/bin/tabula CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env jruby
1
+ #!/usr/bin/env jruby -J-Djava.awt.headless=true
2
2
  # encoding: utf-8
3
3
  require 'trollop'
4
4
  require_relative '../lib/tabula'
@@ -9,7 +9,7 @@ def parse_pages_arg(pages_arg)
9
9
  if(pages_arg == 'all')
10
10
  return :all
11
11
  end
12
-
12
+
13
13
  ranges = pages_arg.split(',').map(&:strip)
14
14
  pages = []
15
15
  ranges.each do |range|
@@ -100,7 +100,7 @@ def main
100
100
  else
101
101
  false
102
102
  end
103
-
103
+
104
104
  extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
105
105
  extractor.extract.each_with_index do |pdf_page, page_index|
106
106
 
data/lib/tabula.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  module Tabula
2
2
  PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
3
3
  ONLY_SPACES_RE = Regexp.new('^\s+$')
4
+ SAME_CHAR_RE = Regexp.new('^(.)\1+$')
4
5
  end
5
6
 
6
7
  require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
@@ -8,7 +9,6 @@ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
8
9
  require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
9
10
  require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
10
11
 
11
-
12
12
  import 'java.util.logging.LogManager'
13
13
  import 'java.util.logging.Level'
14
14
 
@@ -22,13 +22,17 @@ lm.logger_names.each do |name|
22
22
  end
23
23
  end
24
24
  end
25
-
26
-
27
25
  require_relative './tabula/version'
28
26
  require_relative './tabula/core_ext'
27
+
29
28
  require_relative './tabula/entities'
30
29
  require_relative './tabula/extraction'
31
30
  require_relative './tabula/table_extractor'
32
31
  require_relative './tabula/writers'
33
- require_relative './tabula/line_segment_detector'
34
- require_relative './tabula/pdf_render'
32
+
33
+ module Tabula
34
+ autoload :LSD , File.expand_path('tabula/line_segment_detector.rb', File.dirname(__FILE__))
35
+ autoload :Render , File.expand_path('tabula/pdf_render.rb', File.dirname(__FILE__))
36
+ end
37
+
38
+ require_relative './tabula/table_extractor'
@@ -1,3 +1,4 @@
1
+ require_relative './entities/tabular'
1
2
  require_relative './entities/zone_entity'
2
3
  require_relative './entities/cell'
3
4
  require_relative './entities/has_cells'
@@ -15,7 +15,7 @@ module Tabula
15
15
  @placeholder = false
16
16
  @spanning = false
17
17
  @text_elements = []
18
- @options = ({:use_line_returns => false, :cell_debug => NORMAL}).merge options
18
+ @options = ({:use_line_returns => true, :cell_debug => NORMAL}).merge options
19
19
  end
20
20
 
21
21
  def self.new_from_points(topleft, bottomright, options={})
@@ -29,11 +29,13 @@ module Tabula
29
29
  output = ""
30
30
  text_elements.sort #use the default sort for ZoneEntity
31
31
  text_elements.group_by(&:top).values.each do |row|
32
- output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\n" : '')
33
- end
32
+ output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\r" : '')
33
+ # per @bchartoff, https://github.com/jazzido/tabula-extractor/pull/65#issuecomment-32899336
34
+ # line returns as \r behave better in Excel.
35
+ end
34
36
  if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
35
37
  text_output = output.dup
36
- output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
38
+ output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
37
39
  output += " \n #{text_output}"
38
40
  end
39
41
  output.strip
@@ -6,27 +6,30 @@ module Tabula
6
6
  # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors; ruling_lines reader
7
7
  module HasCells
8
8
 
9
- ANOTHER_MAGIC_NUMBER = 0.75
9
+ ARBITRARY_MAGIC_HEURISTIC_NUMBER = 0.65
10
10
 
11
11
  def is_tabular?
12
+ ratio = heuristic_ratio
13
+ return ratio > ARBITRARY_MAGIC_HEURISTIC_NUMBER && ratio < (1 / ARBITRARY_MAGIC_HEURISTIC_NUMBER)
14
+ end
15
+
16
+ def heuristic_ratio
12
17
  #spreadsheet extraction
13
18
  spreadsheet = spreadsheets.first
14
- return false if spreadsheet.nil?
19
+ return Float::NAN if spreadsheet.nil?
15
20
  rows_defined_by_lines = spreadsheet.rows.size #rows filled in automatically
16
21
  columns_defined_by_lines = spreadsheet.cols.size
17
22
 
18
23
  table = self.get_table
19
24
  columns_defined_without_lines = table.cols.size
20
25
  rows_defined_without_lines = table.rows.size
21
- ratio = ((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
22
-
23
- return ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
26
+ ((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
24
27
  end
25
28
 
26
29
  # finds cells from the ruling lines on the page.
27
30
  # implements Nurminen thesis algorithm cf. https://github.com/jazzido/tabula-extractor/issues/16
28
31
  # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
29
- def find_cells!(options={})
32
+ def find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options={})
30
33
  # All lines need to be sorted from top to bottom,
31
34
  # and left to right in ascending order
32
35
 
@@ -39,9 +42,10 @@ module Tabula
39
42
  # depending on the Point2D default sort here.
40
43
  intersection_points_array = intersection_points.keys.sort
41
44
 
42
- intersection_points.each_with_index do |(topLeft, ((horizontal, vertical))), i|
45
+ intersection_points_array.each_with_index do |topLeft, i|
43
46
  # Fetch all points on the same vertical and horizontal
44
47
  # line with current crossing point
48
+ horizontal, vertical = intersection_points[topLeft]
45
49
 
46
50
  # this lets us go to the next intersection_point in intersection_points_array
47
51
  # it is bad and I feel bad.
@@ -64,19 +68,19 @@ module Tabula
64
68
  # point;
65
69
  next unless horizontal.colinear?(y_point)
66
70
  #Hypothetical bottom right point of rectangle
67
- btmRight = Point2D::Float.new( y_point.x, x_point.y )
71
+ btmRight = Point2D::Float.new(y_point.x, x_point.y)
68
72
  if intersection_points.include?(btmRight)
69
- intersection_points[btmRight].each do |btmRightHorizontal, btmRightVertical|
70
- if btmRightHorizontal.colinear?( x_point ) &&
73
+ btmRightHorizontal, btmRightVertical = intersection_points[btmRight]
74
+
75
+ if btmRightHorizontal.colinear?( x_point ) &&
71
76
  btmRightVertical.colinear?( y_point )
72
- # Rectangle is confirmed to have 4 sides
73
- cellsFound << Cell.new_from_points( topLeft, btmRight, options)
74
- # Each crossing point can be the top left corner
75
- # of only a single rectangle
76
- #next crossing-point; we need to "next" out of the outer loop here
77
- # to avoid creating non-minimal cells, I htink.
78
- throw :cellCreated
79
- end
77
+ # Rectangle is confirmed to have 4 sides
78
+ cellsFound << Cell.new_from_points( topLeft, btmRight, options)
79
+ # Each crossing point can be the top left corner
80
+ # of only a single rectangle
81
+ #next crossing-point; we need to "next" out of the outer loop here
82
+ # to avoid creating non-minimal cells, I htink.
83
+ throw :cellCreated
80
84
  end
81
85
  end
82
86
  end
@@ -87,66 +91,6 @@ module Tabula
87
91
  cellsFound
88
92
  end
89
93
 
90
- #############################
91
- # Chapter 2, Spanning Cells #
92
- #############################
93
- #if c is a "spanning cell", that is
94
- # if there are N>0 vertical lines strictly between this cell's left and right
95
- #insert N placeholder cells after it with zero size (but same top)
96
-
97
- # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
98
- def add_spanning_cells!
99
- #rounding: because Cell.new_from_points, using in #find_cells above, has
100
- # a float precision error where, for instance, a cell whose x2 coord is
101
- # supposed to be 160.137451171875 comes out as 160.13745498657227 because
102
- # of minus. :(
103
- vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
104
- horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
105
-
106
- cells.each do |c|
107
- vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
108
- horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
109
-
110
- unless vertical_rulings_spanned_over.empty?
111
- c.spanning = true
112
- vertical_rulings_spanned_over.each do |spanned_over_line_loc|
113
- placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
114
- placeholder.placeholder = true
115
- cells << placeholder
116
- end
117
- end
118
- unless horizontal_rulings_spanned_over.empty?
119
- c.spanning = true
120
- horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
121
- placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
122
- placeholder.placeholder = true
123
- cells << placeholder
124
- end
125
- end
126
-
127
- #if there's a spanning cell that's spans over both rows and columns, then it has "double placeholder" cells
128
- # e.g. -------------------
129
- # | C | C | C | C | (this is some pretty sweet ASCII art, eh?)
130
- # |-----------------|
131
- # | C | C | C | C |
132
- # |-----------------|
133
- # | C | SC P | C | where MC is the "spanning cell" that holds all the text within its bounds
134
- # |---- + ----| P is a "placeholder" cell with either zero width or zero height
135
- # | C | P DP | C | DP is a "double placeholder" cell with zero width and zero height
136
- # |---- + ----| C is an ordinary cell.
137
- # | C | P DP | C |
138
- # |-----------------|
139
-
140
- unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
141
- double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
142
- placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
143
- placeholder.placeholder = true
144
- cells << placeholder
145
- end
146
- end
147
- end
148
- end
149
-
150
94
  #TODO:
151
95
  #returns array of Spreadsheet objects constructed (or spreadsheet_areas => cells)
152
96
  #maybe placeholders should be added after cells is split into spreadsheets
@@ -3,6 +3,8 @@ module Tabula
3
3
  attr_accessor :text_elements
4
4
  attr_reader :index
5
5
 
6
+ SPACE_RUN_MAX_LENGTH = 3
7
+
6
8
  def initialize(index=nil)
7
9
  @text_elements = []
8
10
  @index = index
@@ -16,15 +18,59 @@ module Tabula
16
18
  self.width = t.width
17
19
  self.height = t.height
18
20
  else
19
- if in_same_column = @text_elements.find { |te| te.horizontally_overlaps?(t) }
20
- in_same_column.merge!(t)
21
- else
22
- self.text_elements << t
23
- self.merge!(t)
24
- end
21
+ self.text_elements << t
22
+ self.merge!(t)
25
23
  end
26
24
  end
27
25
 
26
+ ##
27
+ # remove runs of the space char longer than SPACE_RUN_MAX_LENGTH
28
+ # should not change dimensions of the container +Line+
29
+ def remove_sequential_spaces!(seq_spaces_count=SPACE_RUN_MAX_LENGTH)
30
+ self.text_elements = self.text_elements.reduce([]) do |memo, text_chunk|
31
+ long_space_runs = text_chunk
32
+ .text_elements
33
+ .chunk { |te| te.text == ' '} # detect runs of spaces...
34
+ .select { |is_space, text_elements| # ...longer than SPACE_RUN_MAX_LENGTH
35
+ is_space && !text_elements.nil? && text_elements.size >= SPACE_RUN_MAX_LENGTH
36
+ }
37
+ .map { |_, text_elements| text_elements }
38
+
39
+ # no long runs of spaces
40
+ # keep as it was and end iteration
41
+ if long_space_runs.empty?
42
+ memo << text_chunk
43
+ next memo
44
+ end
45
+
46
+ ranges = long_space_runs.map { |lsr|
47
+ idx = text_chunk
48
+ .text_elements
49
+ .index { |te| te.equal?(lsr.first) } # we need pointer comparison here
50
+ (idx)..(idx+lsr.size-1)
51
+ }
52
+
53
+ in_run = false
54
+ new_chunk = true
55
+ text_chunk
56
+ .text_elements
57
+ .each_with_index do |te, i|
58
+ if ranges.any? { |r| r.include?(i) } # te belongs to a run of spaces, skip
59
+ in_run = true
60
+ else
61
+ if in_run || new_chunk
62
+ memo << TextChunk.create_from_text_element(te)
63
+ else
64
+ memo.last << te
65
+ end
66
+ in_run = new_chunk = false
67
+ end
68
+ end
69
+ memo
70
+ end # reduce
71
+ self
72
+ end
73
+
28
74
  #used for testing, ignores text element stuff besides stripped text.
29
75
  def ==(other)
30
76
  return false if other.nil?
@@ -6,7 +6,7 @@ module Tabula
6
6
  attr_writer :min_char_width, :min_char_height
7
7
  attr_accessor :cells
8
8
 
9
- def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
9
+ def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
10
10
  super(0, 0, width, height)
11
11
  @rotation = rotation
12
12
  if number < 1
@@ -19,10 +19,16 @@ module Tabula
19
19
  @spreadsheets = nil
20
20
  @min_char_width = min_char_width
21
21
  @min_char_height = min_char_height
22
- @spatial_index = TextElementIndex.new
23
22
 
24
23
  self.texts = texts
25
- self.texts.each { |te| @spatial_index << te }
24
+
25
+ if spatial_index.nil?
26
+ @spatial_index = TextElementIndex.new
27
+ self.texts.each { |te| @spatial_index << te }
28
+ else
29
+ @spatial_index = spatial_index
30
+ end
31
+
26
32
  end
27
33
 
28
34
  def min_char_width
@@ -49,7 +55,8 @@ module Tabula
49
55
  texts,
50
56
  Ruling.crop_rulings_to_area(@ruling_lines, area),
51
57
  texts.map(&:width).min,
52
- texts.map(&:height).min)
58
+ texts.map(&:height).min,
59
+ @spatial_index)
53
60
  return page_area
54
61
  end
55
62
 
@@ -60,28 +67,33 @@ module Tabula
60
67
  return Tabula::Table.new(0, [])
61
68
  end
62
69
 
63
- text_chunks = TextElement.merge_words(self.texts.sort, options).sort
70
+ texts = self.texts.sort
71
+ text_chunks = TextElement.merge_words(texts, options)
64
72
 
65
- lines = TextChunk.group_by_lines(text_chunks)
73
+ lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)
66
74
 
67
- unless options[:vertical_rulings].empty?
68
- columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
69
- separators = columns.sort.reverse
70
- else
71
- columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
72
- text_chunks)
73
- separators = columns[1..-1].sort.reverse
74
- end
75
+ columns = unless options[:vertical_rulings].empty?
76
+ options[:vertical_rulings].map(&:left).sort #pixel locations, not entities
77
+ else
78
+ TextChunk.column_positions(lines).sort
79
+ end
75
80
 
76
- table = Table.new(lines.count, separators)
81
+ table = Table.new(lines.count, columns)
77
82
  lines.each_with_index do |line, i|
78
- line.text_elements.each do |te|
79
- j = separators.find_index { |s| te.left > s } || separators.count
80
- table.add_text_element(te, i, separators.count - j)
83
+ line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
84
+ j = columns.find_index { |s| te.left <= s } || columns.count
85
+ table.add_text_element(te, i, j)
81
86
  end
82
87
  end
83
88
 
84
- table.lstrip_lines!
89
+ # fixes up the table a little bit, replacing nils with empty TextElements
90
+ # and sorting the lines.
91
+ # table.rows.each do |l|
92
+ # l.text_elements = l.text_elements.map do |te|
93
+ # te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
94
+ # end
95
+ # end
96
+ # table.rows.sort_by!(&:top)
85
97
  table
86
98
  end
87
99
 
@@ -96,7 +108,7 @@ module Tabula
96
108
  return @spreadsheets
97
109
  end
98
110
  get_ruling_lines!(options)
99
- self.find_cells!(options)
111
+ self.find_cells!(self.horizontal_ruling_lines, self.vertical_ruling_lines, options)
100
112
 
101
113
  spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.
102
114
 
@@ -157,14 +169,18 @@ module Tabula
157
169
 
158
170
  # returns the ruling lines, memoizing them in @vertical_ruling_lines and @horizontal_ruling_lines
159
171
  def get_ruling_lines!(options={})
160
- if !@ruling_lines.nil? && !@ruling_lines.empty?
161
- self.snap_points!
162
- @vertical_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
163
- @horizontal_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
164
- @vertical_ruling_lines + @horizontal_ruling_lines
165
- else
166
- []
172
+ if @ruling_lines.nil? || @ruling_lines.empty?
173
+ return []
167
174
  end
175
+ self.snap_points!
176
+
177
+ @ruling_lines.select! { |l| !(l.width == 0 && l.height == 0) }
178
+
179
+ @vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
180
+ @horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
181
+
182
+ @vertical_ruling_lines + @horizontal_ruling_lines
183
+
168
184
  end
169
185
 
170
186
  ##
@@ -252,29 +268,6 @@ module Tabula
252
268
  l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
253
269
  end
254
270
  end
255
-
256
- def collapse_oriented_rulings(lines)
257
- # lines must all be of one orientation (i.e. horizontal, vertical)
258
-
259
- if lines.empty?
260
- return []
261
- end
262
-
263
- lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
264
-
265
- lines = lines.inject([lines.shift]) do |memo, next_line|
266
- last = memo.last
267
- if next_line.position == last.position && last.nearlyIntersects?(next_line)
268
- memo.last.start = next_line.start < last.start ? next_line.start : last.start
269
- memo.last.end = next_line.end < last.end ? last.end : next_line.end
270
- memo
271
- elsif next_line.length == 0
272
- memo
273
- else
274
- memo << next_line
275
- end
276
- end
277
- end
278
271
  end
279
272
 
280
273
  end