tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
@@ -1,50 +0,0 @@
1
- require 'algorithms'
2
- module Tabula
3
- module Whitespace
4
-
5
- # Detect whitespace in a document (not yet used in Tabula)
6
- # Described in "Two Geometric Algorithms for layout analysis" (Thomas Breuel)
7
- # http://pdf.aminer.org/000/140/219/two_geometric_algorithms_for_layout_analysis.pdf
8
-
9
- def self.find_closest(text_elements, x, y)
10
- text_elements.sort_by { |te|
11
- Math.sqrt((x - te.midpoint[0]) ** 2 + (y - te.midpoint[1]) ** 2)
12
- }.first
13
- end
14
-
15
-
16
- def self.find_whitespace(text_elements, bounds)
17
- queue = Containers::PriorityQueue.new
18
- queue.push([bounds, text_elements], bounds.width * bounds.height)
19
- rv = []
20
-
21
-
22
- while !queue.empty?
23
- r, obstacles = queue.pop
24
- if obstacles.empty?
25
- return r
26
- end
27
-
28
- pivot = self.find_closest(obstacles, *r.midpoint)
29
-
30
- subrectangles = [
31
- ZoneEntity.new(r.top, pivot.right, r.right - pivot.right, pivot.top - r.top),
32
- ZoneEntity.new(r.top, r.left, pivot.left - r.left, pivot.top - r.top),
33
- ZoneEntity.new(pivot.bottom, r.left, pivot.left - r.left, r.bottom - pivot.bottom),
34
- ZoneEntity.new(pivot.bottom, pivot.right, r.right - pivot.right, r.bottom - pivot.bottom)
35
- ]
36
- subrectangles.each do |sub_r|
37
- obs = obstacles.select { |s|
38
- s.overlaps?(sub_r)
39
- }
40
- if obs.empty?
41
- rv << sub_r
42
- else
43
- queue.push([sub_r, obs], sub_r.width * sub_r.height)
44
- end
45
- end
46
- end
47
- return rv
48
- end
49
- end
50
- end
@@ -1,29 +0,0 @@
1
- require './lib/tabula'
2
-
3
- input_filename = "vertical_rulings_bug.pdf"
4
- out = File.new("output.xls", 'w')
5
-
6
- extractor = Tabula::Extraction::CharacterExtractor.new(input_filename, :all) #:all ) # 1..2643
7
- extractor.extract.each_with_index do |pdf_page, page_index|
8
-
9
- lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(input_filename, page_index))
10
- page_areas = [[0, 0, 1000, 1700]]
11
-
12
- scale_factor = pdf_page.width / 1700
13
- puts scale_factor
14
-
15
- vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Geometry::Segment.new_by_arrays([n * scale_factor, 0], [n * scale_factor, 1000])}
16
-
17
- page_areas.each do |page_area|
18
- text = pdf_page.get_text( page_area ) #all the characters within the given area.
19
-
20
- Tabula::Writers.send(:TSV,
21
- Tabula.make_table_with_vertical_rulings(text, {:vertical_rulings => vertical_rulings, :merge_words => true, :dontmerge => true}),
22
- out)
23
- end
24
- end
25
- out.close
26
-
27
-
28
- #with dontmerge false (i.e. if we merge) we get crap. STCITY and no spaces in any cities.
29
- #with dontmerge true (or commented out), MORGANTOWNWV, and some spaces (e.g. BRYN MAWR, but not FRESHMEADOWS)