tabula-extractor 0.7.6-java → 0.8.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/lib/tabula.rb +1 -0
- data/lib/tabula/entities/page.rb +2 -2
- data/lib/tabula/entities/text_chunk.rb +1 -0
- data/lib/tabula/entities/text_element.rb +29 -20
- data/lib/tabula/version.rb +1 -1
- data/target/slf4j-nop-1.7.10.jar +0 -0
- metadata +41 -40
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dd31a41b459d191430cf39b8a8c920b5339033e4
|
4
|
+
data.tar.gz: 660bb81f7fc497cdec9550898bd9a895a1d89fb6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38d0dc513c668466cc8f45d102f28003ce4a982ea14dd56b7a71ce2e0de4efd3f237521ea481ea1541a56209ec60249f2eaef33b9be9cede86f37be2a972bed4
|
7
|
+
data.tar.gz: bf4bba7e5817d624e705960cbfc3b9db8ada3229ad4a1e71df4b6b5c82d4245febce6df68e3d3addf2470e00d0342d41ee02b95607358f8aa1bccfc8c67799e2
|
data/README.md
CHANGED
@@ -161,7 +161,14 @@ extractor.extract.each_with_index do |pdf_page, page_index|
|
|
161
161
|
end
|
162
162
|
extractor.close!
|
163
163
|
out.close
|
164
|
-
|
164
|
+
```
|
165
|
+
|
166
|
+
`tabula-extractor` has also been used successfully as a part of data extraction pipelines. [This blog post](http://open.blogs.nytimes.com/2015/04/03/purifying-the-sea-of-pdf-data-automatically/) discusses a possible pattern for creating these and includes a few examples:
|
167
|
+
|
168
|
+
- Sierra Leone’s Ebola situation reports: [GitHub](https://github.com/jeremybmerrill/ebola_parsers/tree/master/sierra_leone)
|
169
|
+
- The NYPD’s CompStat criminal complaints database weekly reports: [GitHub](https://github.com/nytinteractive/compstat_parser)
|
170
|
+
- The NYPD’s monthly reports of moving summonses: [GitHub](https://github.com/nytinteractive/moving_summonses_parser)
|
171
|
+
|
165
172
|
|
166
173
|
## How Does This Work? Like, Theoretically?
|
167
174
|
|
data/lib/tabula.rb
CHANGED
@@ -6,6 +6,7 @@ end
|
|
6
6
|
|
7
7
|
require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
|
8
8
|
require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
|
9
|
+
require File.join(File.dirname(__FILE__), '../target/', 'slf4j-nop-1.7.10.jar')
|
9
10
|
require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
|
10
11
|
require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
|
11
12
|
|
data/lib/tabula/entities/page.rb
CHANGED
@@ -67,11 +67,11 @@ module Tabula
|
|
67
67
|
end
|
68
68
|
|
69
69
|
def get_min_char_width
|
70
|
-
@min_char_width ||= texts.map(&:width).min
|
70
|
+
@min_char_width ||= texts.map(&:width).min || ::Float::INFINITY
|
71
71
|
end
|
72
72
|
|
73
73
|
def get_min_char_height
|
74
|
-
@min_char_height ||= texts.map(&:height).min
|
74
|
+
@min_char_height ||= texts.map(&:height).min || ::Float::INFINITY
|
75
75
|
end
|
76
76
|
|
77
77
|
def get_area(area)
|
@@ -46,6 +46,7 @@ module Tabula
|
|
46
46
|
# returns a list of column boundaries (x axis)
|
47
47
|
# +lines+ must be an array of lines sorted by their +top+ attribute
|
48
48
|
def self.column_positions(lines)
|
49
|
+
return [] if lines.empty?
|
49
50
|
init = lines.first.text_elements.inject([]) { |memo, text_chunk|
|
50
51
|
next memo if text_chunk.text =~ ONLY_SPACES_RE
|
51
52
|
memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)
|
@@ -43,23 +43,25 @@ module Tabula
|
|
43
43
|
text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
|
44
44
|
|
45
45
|
|
46
|
-
previousAveCharWidth = text_chunks.first.width
|
47
46
|
endOfLastTextX = text_chunks.first.right
|
48
47
|
maxYForLine = text_chunks.first.bottom
|
49
48
|
maxHeightForLine = text_chunks.first.height
|
50
49
|
minYTopForLine = text_chunks.first.top
|
51
|
-
lastWordSpacing = -1
|
52
50
|
sp = nil
|
53
51
|
|
52
|
+
char_widths_so_far = []
|
53
|
+
word_spacings_so_far = []
|
54
|
+
|
54
55
|
text_elements.inject(text_chunks) do |chunks, char|
|
55
56
|
|
56
57
|
current_chunk = chunks.last
|
57
58
|
prev_char = current_chunk.text_elements.last
|
58
59
|
|
59
|
-
# Resets the
|
60
|
+
# Resets the character/spacing widths (used for averages) when we see a change in font
|
60
61
|
# or a change in the font size
|
61
62
|
if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
|
62
|
-
|
63
|
+
char_widths_so_far = []
|
64
|
+
word_spacings_so_far = []
|
63
65
|
end
|
64
66
|
|
65
67
|
# if same char AND overlapped, skip
|
@@ -78,27 +80,25 @@ module Tabula
|
|
78
80
|
}
|
79
81
|
|
80
82
|
# Estimate the expected width of the space based on the
|
81
|
-
# space character with some margin
|
83
|
+
# average width of the space character with some margin
|
82
84
|
wordSpacing = char.width_of_space
|
83
85
|
deltaSpace = 0
|
84
86
|
deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
|
85
87
|
::Float::MAX
|
86
|
-
elsif
|
88
|
+
elsif word_spacings_so_far.empty?
|
87
89
|
wordSpacing * 0.5 # 0.5 == spacingTolerance
|
88
90
|
else
|
89
|
-
((
|
91
|
+
(word_spacings_so_far.reduce(&:+).to_f / word_spacings_so_far.size) * 0.5
|
90
92
|
end
|
91
93
|
|
94
|
+
word_spacings_so_far << wordSpacing
|
95
|
+
char_widths_so_far << (char.width / char.text.size)
|
96
|
+
|
92
97
|
# Estimate the expected width of the space based on the
|
93
|
-
# average character width with some margin.
|
94
|
-
# make a true average (average of averages) but we found that it gave the
|
95
|
-
# best results after numerous experiments. Based on experiments we also found that
|
98
|
+
# average character width with some margin. Based on experiments we also found that
|
96
99
|
# .3 worked well.
|
97
|
-
averageCharWidth =
|
98
|
-
|
99
|
-
else
|
100
|
-
(previousAveCharWidth + (char.width / char.text.size)) / 2.0
|
101
|
-
end
|
100
|
+
averageCharWidth = char_widths_so_far.reduce(&:+).to_f / char_widths_so_far.size
|
101
|
+
|
102
102
|
deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
|
103
103
|
|
104
104
|
# Compares the values obtained by the average method and the wordSpacing method and picks
|
@@ -119,7 +119,19 @@ module Tabula
|
|
119
119
|
sameLine = false
|
120
120
|
end
|
121
121
|
|
122
|
-
|
122
|
+
# characters tend to be ordered by their left location
|
123
|
+
# in determining whether to add a space, we need to know the distance
|
124
|
+
# between the current character's left and the nearest character's
|
125
|
+
# right. The nearest character may not be the previous character, so we
|
126
|
+
# need to keep track of the character with the greatest right x-axis
|
127
|
+
# location -- that's endOfLastTextX
|
128
|
+
# (in some fonts, one character may be completely "on top of"
|
129
|
+
# another character, with the wider character starting to the left and
|
130
|
+
# ending to the right of the narrower character, e.g. ANSI
|
131
|
+
# representations of some South Asian languages, see
|
132
|
+
# https://github.com/tabulapdf/tabula/issues/303)
|
133
|
+
endOfLastTextX = [char.right, endOfLastTextX].max
|
134
|
+
|
123
135
|
# should we add a space?
|
124
136
|
if !across_vertical_ruling \
|
125
137
|
&& sameLine \
|
@@ -161,11 +173,8 @@ module Tabula
|
|
161
173
|
chunks << TextChunk.create_from_text_element(char)
|
162
174
|
end
|
163
175
|
|
164
|
-
lastWordSpacing = wordSpacing
|
165
|
-
previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
|
166
|
-
|
167
176
|
chunks
|
168
|
-
end
|
177
|
+
end.each{|chunk| chunk.text_elements.sort_by!{|char| char.left + char.right } }
|
169
178
|
end
|
170
179
|
|
171
180
|
##
|
data/lib/tabula/version.rb
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,87 +1,87 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
8
8
|
- Jeremy B. Merill
|
9
9
|
- Mike Tigas
|
10
|
-
autorequire:
|
10
|
+
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2015-
|
13
|
+
date: 2015-08-20 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
17
|
-
version_requirements: !ruby/object:Gem::Requirement
|
18
|
-
requirements:
|
19
|
-
- - '>='
|
20
|
-
- !ruby/object:Gem::Version
|
21
|
-
version: 1.3.4
|
22
17
|
requirement: !ruby/object:Gem::Requirement
|
23
18
|
requirements:
|
24
|
-
- -
|
19
|
+
- - ">="
|
25
20
|
- !ruby/object:Gem::Version
|
26
21
|
version: 1.3.4
|
27
|
-
prerelease: false
|
28
22
|
type: :development
|
29
|
-
|
30
|
-
name: ruby-debug
|
23
|
+
prerelease: false
|
31
24
|
version_requirements: !ruby/object:Gem::Requirement
|
32
25
|
requirements:
|
33
|
-
- -
|
26
|
+
- - ">="
|
34
27
|
- !ruby/object:Gem::Version
|
35
|
-
version:
|
28
|
+
version: 1.3.4
|
29
|
+
- !ruby/object:Gem::Dependency
|
30
|
+
name: ruby-debug
|
36
31
|
requirement: !ruby/object:Gem::Requirement
|
37
32
|
requirements:
|
38
|
-
- -
|
33
|
+
- - ">="
|
39
34
|
- !ruby/object:Gem::Version
|
40
35
|
version: '0'
|
41
|
-
prerelease: false
|
42
36
|
type: :development
|
43
|
-
|
44
|
-
name: pry
|
37
|
+
prerelease: false
|
45
38
|
version_requirements: !ruby/object:Gem::Requirement
|
46
39
|
requirements:
|
47
|
-
- -
|
40
|
+
- - ">="
|
48
41
|
- !ruby/object:Gem::Version
|
49
42
|
version: '0'
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: pry
|
50
45
|
requirement: !ruby/object:Gem::Requirement
|
51
46
|
requirements:
|
52
|
-
- -
|
47
|
+
- - ">="
|
53
48
|
- !ruby/object:Gem::Version
|
54
49
|
version: '0'
|
55
|
-
prerelease: false
|
56
50
|
type: :development
|
57
|
-
|
58
|
-
name: minitest
|
51
|
+
prerelease: false
|
59
52
|
version_requirements: !ruby/object:Gem::Requirement
|
60
53
|
requirements:
|
61
|
-
- -
|
54
|
+
- - ">="
|
62
55
|
- !ruby/object:Gem::Version
|
63
56
|
version: '0'
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: minitest
|
64
59
|
requirement: !ruby/object:Gem::Requirement
|
65
60
|
requirements:
|
66
|
-
- -
|
61
|
+
- - ">="
|
67
62
|
- !ruby/object:Gem::Version
|
68
63
|
version: '0'
|
69
|
-
prerelease: false
|
70
64
|
type: :development
|
71
|
-
|
72
|
-
name: trollop
|
65
|
+
prerelease: false
|
73
66
|
version_requirements: !ruby/object:Gem::Requirement
|
74
67
|
requirements:
|
75
|
-
- -
|
68
|
+
- - ">="
|
76
69
|
- !ruby/object:Gem::Version
|
77
|
-
version: '
|
70
|
+
version: '0'
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: trollop
|
78
73
|
requirement: !ruby/object:Gem::Requirement
|
79
74
|
requirements:
|
80
|
-
- - ~>
|
75
|
+
- - "~>"
|
81
76
|
- !ruby/object:Gem::Version
|
82
77
|
version: '2.0'
|
83
|
-
prerelease: false
|
84
78
|
type: :runtime
|
79
|
+
prerelease: false
|
80
|
+
version_requirements: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - "~>"
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '2.0'
|
85
85
|
description: extract tables from PDF files
|
86
86
|
email:
|
87
87
|
- manuel@jazzido.com
|
@@ -90,8 +90,8 @@ executables:
|
|
90
90
|
extensions: []
|
91
91
|
extra_rdoc_files: []
|
92
92
|
files:
|
93
|
-
- .gitignore
|
94
|
-
- .travis.yml
|
93
|
+
- ".gitignore"
|
94
|
+
- ".travis.yml"
|
95
95
|
- AUTHORS.md
|
96
96
|
- Gemfile
|
97
97
|
- LICENSE.md
|
@@ -125,29 +125,30 @@ files:
|
|
125
125
|
- target/jsi-1.1.0-SNAPSHOT.jar
|
126
126
|
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
127
127
|
- target/slf4j-api-1.6.3.jar
|
128
|
+
- target/slf4j-nop-1.7.10.jar
|
128
129
|
- target/trove4j-3.0.3.jar
|
129
130
|
homepage: https://github.com/jazzido/tabula-extractor
|
130
131
|
licenses:
|
131
132
|
- MIT
|
132
133
|
metadata: {}
|
133
|
-
post_install_message:
|
134
|
+
post_install_message:
|
134
135
|
rdoc_options: []
|
135
136
|
require_paths:
|
136
137
|
- lib
|
137
138
|
required_ruby_version: !ruby/object:Gem::Requirement
|
138
139
|
requirements:
|
139
|
-
- -
|
140
|
+
- - ">="
|
140
141
|
- !ruby/object:Gem::Version
|
141
142
|
version: '0'
|
142
143
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
143
144
|
requirements:
|
144
|
-
- -
|
145
|
+
- - ">="
|
145
146
|
- !ruby/object:Gem::Version
|
146
147
|
version: '0'
|
147
148
|
requirements: []
|
148
|
-
rubyforge_project:
|
149
|
-
rubygems_version: 2.
|
150
|
-
signing_key:
|
149
|
+
rubyforge_project:
|
150
|
+
rubygems_version: 2.4.5
|
151
|
+
signing_key:
|
151
152
|
specification_version: 4
|
152
153
|
summary: extract tables from PDF files
|
153
154
|
test_files: []
|