tabula-extractor 0.6.5-java → 0.6.6-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -232,6 +232,10 @@ module Tabula
232
232
  self.height = t.height
233
233
  else
234
234
  if in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
235
+ #sometimes a space needs to be added here
236
+ unless in_same_column.vertically_overlaps?(t)
237
+ t.text = " " + t.text
238
+ end
235
239
  in_same_column.merge!(t)
236
240
  else
237
241
  self.text_elements << t
@@ -14,7 +14,15 @@ module Tabula
14
14
  def initialize(text_elements, options = {})
15
15
  self.text_elements = text_elements
16
16
  self.options = DEFAULT_OPTIONS.merge(options)
17
- merge_words! if self.options[:merge_words]
17
+
18
+ if self.options[:merge_words]
19
+ if self.options[:vertical_rulings]
20
+ merge_words_in_a_vertical_rulings_aware_manner!(self.options[:vertical_rulings])
21
+ else
22
+ merge_words!
23
+ end
24
+ end
25
+
18
26
  end
19
27
 
20
28
  def get_rows
@@ -49,7 +57,7 @@ module Tabula
49
57
  if column = columns.detect { |c| te.horizontally_overlaps?(c) }
50
58
  column << te
51
59
  else
52
- puts "couldn't find a place for #{te.inspect}"
60
+ #puts "couldn't find a place for #{te.inspect}"
53
61
  #columns << Column.new(te.left, te.width, [te])
54
62
  end
55
63
  end
@@ -134,21 +142,69 @@ module Tabula
134
142
  self.text_elements.compact!
135
143
  return self.text_elements
136
144
  end
145
+
146
+ #this is where spaces come from!
147
+ def merge_words_in_a_vertical_rulings_aware_manner!(vertical_rulings)
148
+ #don't merge words across a ruling.
149
+
150
+ return self.text_elements if @merged # only merge once. awful hack.
151
+ @merged = true
152
+ current_word_index = i = 0
153
+ char1 = self.text_elements[i]
154
+ vertical_ruling_locations = vertical_rulings.map &:left
155
+
156
+ while i < self.text_elements.size-1 do
157
+
158
+ char2 = self.text_elements[i+1]
159
+
160
+ next if char2.nil? or char1.nil?
161
+
162
+ if self.text_elements[current_word_index].should_merge?(char2)
163
+ unless vertical_ruling_locations.map{|loc| self.text_elements[current_word_index].left < loc && char2.left > loc}.include?(true)
164
+ self.text_elements[current_word_index].merge!(char2)
165
+ end
166
+
167
+ char1 = char2
168
+ self.text_elements[i+1] = nil
169
+ else
170
+ # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
171
+ if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
172
+ self.text_elements[current_word_index].text += " "
173
+ #self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
174
+ end
175
+ current_word_index = i+1
176
+ end
177
+ i += 1
178
+ end
179
+ self.text_elements.compact!
180
+ return self.text_elements
181
+ end
137
182
  end
138
183
 
139
- # TODO next four module methods are deprecated
184
+ ##
185
+ # Deprecated.
186
+ ##
140
187
  def Tabula.group_by_columns(text_elements, merge_words=false)
141
188
  TableExtractor.new(text_elements, :merge_words => merge_words).group_by_columns
142
189
  end
143
190
 
191
+ ##
192
+ # Deprecated.
193
+ ##
144
194
  def Tabula.get_line_boundaries(text_elements)
145
195
  TableExtractor.new(text_elements).get_line_boundaries
146
196
  end
147
197
 
198
+ ##
199
+ # Deprecated.
200
+ ##
148
201
  def Tabula.get_columns(text_elements, merge_words=true)
149
202
  TableExtractor.new(text_elements, :merge_words => merge_words).get_columns
150
203
  end
151
204
 
205
+ ##
206
+ # Deprecated.
207
+ ##
152
208
  def Tabula.get_rows(text_elements, merge_words=true)
153
209
  TableExtractor.new(text_elements, :merge_words => merge_words).get_rows
154
210
  end
@@ -256,7 +312,7 @@ module Tabula
256
312
  vertical_rulings = options[:vertical_rulings]
257
313
  columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)
258
314
 
259
- # insert empty cells if needed
315
+ # insert an empty cell in a given column if there's no text elements within that column's boundaries
260
316
  lines.each_with_index do |l, line_index|
261
317
  next if l.text_elements.nil?
262
318
  l.text_elements.compact! # TODO WHY do I have to do this?
@@ -271,26 +327,28 @@ module Tabula
271
327
  end
272
328
 
273
329
  # merge elements that are in the same column
274
- lines.each_with_index do |l, line_index|
275
- next if l.text_elements.nil?
276
-
277
- (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2| #don't remove a string of empty cells
278
- next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
279
-
280
- # if same column...
281
- if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
282
- == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
283
- if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
284
- l.text_elements[t1].merge!(l.text_elements[t2])
285
- l.text_elements[t2] = nil
286
- else
287
- l.text_elements[t2].merge!(l.text_elements[t1])
288
- l.text_elements[t1] = nil
330
+ unless options[:dontmerge]
331
+ lines.each_with_index do |l, line_index|
332
+ next if l.text_elements.nil?
333
+
334
+ (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2| #don't remove a string of empty cells
335
+ next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
336
+
337
+ # if same column...
338
+ if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
339
+ == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
340
+ if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
341
+ l.text_elements[t1].merge!(l.text_elements[t2])
342
+ l.text_elements[t2] = nil
343
+ else
344
+ l.text_elements[t2].merge!(l.text_elements[t1])
345
+ l.text_elements[t1] = nil
346
+ end
289
347
  end
290
348
  end
291
- end
292
349
 
293
- l.text_elements.compact!
350
+ l.text_elements.compact!
351
+ end
294
352
  end
295
353
 
296
354
  # remove duplicate lines
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.5'
2
+ VERSION = '0.6.6'
3
3
  end
Binary file
@@ -0,0 +1,29 @@
1
+ require './lib/tabula'
2
+
3
+ input_filename = "vertical_rulings_bug.pdf"
4
+ out = File.new("output.xls", 'w')
5
+
6
+ extractor = Tabula::Extraction::CharacterExtractor.new(input_filename, :all) #:all ) # 1..2643
7
+ extractor.extract.each_with_index do |pdf_page, page_index|
8
+
9
+ lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(input_filename, page_index))
10
+ page_areas = [[0, 0, 1000, 1700]]
11
+
12
+ scale_factor = pdf_page.width / 1700
13
+ puts scale_factor
14
+
15
+ vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Geometry::Segment.new_by_arrays([n * scale_factor, 0], [n * scale_factor, 1000])}
16
+
17
+ page_areas.each do |page_area|
18
+ text = pdf_page.get_text( page_area ) #all the characters within the given area.
19
+
20
+ Tabula::Writers.send(:TSV,
21
+ Tabula.make_table_with_vertical_rulings(text, {:vertical_rulings => vertical_rulings, :merge_words => true, :dontmerge => true}),
22
+ out)
23
+ end
24
+ end
25
+ out.close
26
+
27
+
28
+ #with dontmerge false (i.e. if we merge) we get crap. STCITY and no spaces in any cities.
29
+ #with dontmerge true (or commented out), MORGANTOWNWV, and some spaces (e.g. BRYN MAWR, but not FRESHMEADOWS)
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.6.5
5
+ version: 0.6.6
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-07-23 00:00:00.000000000 Z
14
+ date: 2013-08-23 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: minitest
@@ -132,6 +132,8 @@ files:
132
132
  - test/data/gre.pdf
133
133
  - test/data/tabla_subsidios.pdf
134
134
  - test/tests.rb
135
+ - vertical_rulings_bug.pdf
136
+ - vertical_rulings_bug.rb
135
137
  homepage: https://github.com/jazzido/tabula-extractor
136
138
  licenses:
137
139
  - MIT