tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
data/lib/tabula/whitespace.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
require 'algorithms'
|
2
|
-
module Tabula
|
3
|
-
module Whitespace
|
4
|
-
|
5
|
-
# Detect whitespace in a document (not yet used in Tabula)
|
6
|
-
# Described in "Two Geometric Algorithms for layout analysis" (Thomas Breuer)
|
7
|
-
# http://pdf.aminer.org/000/140/219/two_geometric_algorithms_for_layout_analysis.pdf
|
8
|
-
|
9
|
-
def self.find_closest(text_elements, x, y)
|
10
|
-
text_elements.sort_by { |te|
|
11
|
-
Math.sqrt((x - te.midpoint[0]) ** 2 + (y - te.midpoint[1]) ** 2)
|
12
|
-
}.first
|
13
|
-
end
|
14
|
-
|
15
|
-
|
16
|
-
def self.find_whitespace(text_elements, bounds)
|
17
|
-
queue = Containers::PriorityQueue.new
|
18
|
-
queue.push([bounds, text_elements], bounds.width * bounds.height)
|
19
|
-
rv = []
|
20
|
-
|
21
|
-
|
22
|
-
while !queue.empty?
|
23
|
-
r, obstacles = queue.pop
|
24
|
-
if obstacles.empty?
|
25
|
-
return r
|
26
|
-
end
|
27
|
-
|
28
|
-
pivot = self.find_closest(obstacles, *r.midpoint)
|
29
|
-
|
30
|
-
subrectangles = [
|
31
|
-
ZoneEntity.new(r.top, pivot.right, r.right - pivot.right, pivot.top - r.top),
|
32
|
-
ZoneEntity.new(r.top, r.left, pivot.left - r.left, pivot.top - r.top),
|
33
|
-
ZoneEntity.new(pivot.bottom, r.left, pivot.left - r.left, r.bottom - pivot.bottom),
|
34
|
-
ZoneEntity.new(pivot.bottom, pivot.right, r.right - pivot.right, r.bottom - pivot.bottom)
|
35
|
-
]
|
36
|
-
subrectangles.each do |sub_r|
|
37
|
-
obs = obstacles.select { |s|
|
38
|
-
s.overlaps?(sub_r)
|
39
|
-
}
|
40
|
-
if obs.empty?
|
41
|
-
rv << sub_r
|
42
|
-
else
|
43
|
-
queue.push([sub_r, obs], sub_r.width * sub_r.height)
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
return rv
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
data/vertical_rulings_bug.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
require './lib/tabula'
|
2
|
-
|
3
|
-
input_filename = "vertical_rulings_bug.pdf"
|
4
|
-
out = File.new("output.xls", 'w')
|
5
|
-
|
6
|
-
extractor = Tabula::Extraction::CharacterExtractor.new(input_filename, :all) #:all ) # 1..2643
|
7
|
-
extractor.extract.each_with_index do |pdf_page, page_index|
|
8
|
-
|
9
|
-
lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(input_filename, page_index))
|
10
|
-
page_areas = [[0, 0, 1000, 1700]]
|
11
|
-
|
12
|
-
scale_factor = pdf_page.width / 1700
|
13
|
-
puts scale_factor
|
14
|
-
|
15
|
-
vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Geometry::Segment.new_by_arrays([n * scale_factor, 0], [n * scale_factor, 1000])}
|
16
|
-
|
17
|
-
page_areas.each do |page_area|
|
18
|
-
text = pdf_page.get_text( page_area ) #all the characters within the given area.
|
19
|
-
|
20
|
-
Tabula::Writers.send(:TSV,
|
21
|
-
Tabula.make_table_with_vertical_rulings(text, {:vertical_rulings => vertical_rulings, :merge_words => true, :dontmerge => true}),
|
22
|
-
out)
|
23
|
-
end
|
24
|
-
end
|
25
|
-
out.close
|
26
|
-
|
27
|
-
|
28
|
-
#with dontmerge false (i.e. if we merge) we get crap. STCITY and no spaces in any cities.
|
29
|
-
#with dontmerge true (or commented out), MORGANTOWNWV, and some spaces (e.g. BRYN MAWR, but not FRESHMEADOWS)
|