tabula-extractor 0.7.6-java → 0.8.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/lib/tabula.rb +1 -0
- data/lib/tabula/entities/page.rb +2 -2
- data/lib/tabula/entities/text_chunk.rb +1 -0
- data/lib/tabula/entities/text_element.rb +29 -20
- data/lib/tabula/version.rb +1 -1
- data/target/slf4j-nop-1.7.10.jar +0 -0
- metadata +41 -40
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dd31a41b459d191430cf39b8a8c920b5339033e4
|
4
|
+
data.tar.gz: 660bb81f7fc497cdec9550898bd9a895a1d89fb6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38d0dc513c668466cc8f45d102f28003ce4a982ea14dd56b7a71ce2e0de4efd3f237521ea481ea1541a56209ec60249f2eaef33b9be9cede86f37be2a972bed4
|
7
|
+
data.tar.gz: bf4bba7e5817d624e705960cbfc3b9db8ada3229ad4a1e71df4b6b5c82d4245febce6df68e3d3addf2470e00d0342d41ee02b95607358f8aa1bccfc8c67799e2
|
data/README.md
CHANGED
@@ -161,7 +161,14 @@ extractor.extract.each_with_index do |pdf_page, page_index|
|
|
161
161
|
end
|
162
162
|
extractor.close!
|
163
163
|
out.close
|
164
|
-
|
164
|
+
```
|
165
|
+
|
166
|
+
`tabula-extractor` has also been used successfully as a part of data extraction pipelines. [This blog post](http://open.blogs.nytimes.com/2015/04/03/purifying-the-sea-of-pdf-data-automatically/) discusses a possible pattern for creating these and includes a few examples:
|
167
|
+
|
168
|
+
- Sierra Leone’s Ebola situation reports: [GitHub](https://github.com/jeremybmerrill/ebola_parsers/tree/master/sierra_leone)
|
169
|
+
- The NYPD’s CompStat criminal complaints database weekly reports: [GitHub](https://github.com/nytinteractive/compstat_parser)
|
170
|
+
- The NYPD’s monthly reports of moving summonses: [GitHub](https://github.com/nytinteractive/moving_summonses_parser)
|
171
|
+
|
165
172
|
|
166
173
|
## How Does This Work? Like, Theoretically?
|
167
174
|
|
data/lib/tabula.rb
CHANGED
@@ -6,6 +6,7 @@ end
|
|
6
6
|
|
7
7
|
require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
|
8
8
|
require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
|
9
|
+
require File.join(File.dirname(__FILE__), '../target/', 'slf4j-nop-1.7.10.jar')
|
9
10
|
require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
|
10
11
|
require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
|
11
12
|
|
data/lib/tabula/entities/page.rb
CHANGED
@@ -67,11 +67,11 @@ module Tabula
|
|
67
67
|
end
|
68
68
|
|
69
69
|
def get_min_char_width
|
70
|
-
@min_char_width ||= texts.map(&:width).min
|
70
|
+
@min_char_width ||= texts.map(&:width).min || ::Float::INFINITY
|
71
71
|
end
|
72
72
|
|
73
73
|
def get_min_char_height
|
74
|
-
@min_char_height ||= texts.map(&:height).min
|
74
|
+
@min_char_height ||= texts.map(&:height).min || ::Float::INFINITY
|
75
75
|
end
|
76
76
|
|
77
77
|
def get_area(area)
|
@@ -46,6 +46,7 @@ module Tabula
|
|
46
46
|
# returns a list of column boundaries (x axis)
|
47
47
|
# +lines+ must be an array of lines sorted by their +top+ attribute
|
48
48
|
def self.column_positions(lines)
|
49
|
+
return [] if lines.empty?
|
49
50
|
init = lines.first.text_elements.inject([]) { |memo, text_chunk|
|
50
51
|
next memo if text_chunk.text =~ ONLY_SPACES_RE
|
51
52
|
memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)
|
@@ -43,23 +43,25 @@ module Tabula
|
|
43
43
|
text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
|
44
44
|
|
45
45
|
|
46
|
-
previousAveCharWidth = text_chunks.first.width
|
47
46
|
endOfLastTextX = text_chunks.first.right
|
48
47
|
maxYForLine = text_chunks.first.bottom
|
49
48
|
maxHeightForLine = text_chunks.first.height
|
50
49
|
minYTopForLine = text_chunks.first.top
|
51
|
-
lastWordSpacing = -1
|
52
50
|
sp = nil
|
53
51
|
|
52
|
+
char_widths_so_far = []
|
53
|
+
word_spacings_so_far = []
|
54
|
+
|
54
55
|
text_elements.inject(text_chunks) do |chunks, char|
|
55
56
|
|
56
57
|
current_chunk = chunks.last
|
57
58
|
prev_char = current_chunk.text_elements.last
|
58
59
|
|
59
|
-
# Resets the
|
60
|
+
# Resets the character/spacing widths (used for averages) when we see a change in font
|
60
61
|
# or a change in the font size
|
61
62
|
if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
|
62
|
-
|
63
|
+
char_widths_so_far = []
|
64
|
+
word_spacings_so_far = []
|
63
65
|
end
|
64
66
|
|
65
67
|
# if same char AND overlapped, skip
|
@@ -78,27 +80,25 @@ module Tabula
|
|
78
80
|
}
|
79
81
|
|
80
82
|
# Estimate the expected width of the space based on the
|
81
|
-
# space character with some margin
|
83
|
+
# average width of the space character with some margin
|
82
84
|
wordSpacing = char.width_of_space
|
83
85
|
deltaSpace = 0
|
84
86
|
deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
|
85
87
|
::Float::MAX
|
86
|
-
elsif
|
88
|
+
elsif word_spacings_so_far.empty?
|
87
89
|
wordSpacing * 0.5 # 0.5 == spacingTolerance
|
88
90
|
else
|
89
|
-
((
|
91
|
+
(word_spacings_so_far.reduce(&:+).to_f / word_spacings_so_far.size) * 0.5
|
90
92
|
end
|
91
93
|
|
94
|
+
word_spacings_so_far << wordSpacing
|
95
|
+
char_widths_so_far << (char.width / char.text.size)
|
96
|
+
|
92
97
|
# Estimate the expected width of the space based on the
|
93
|
-
# average character width with some margin. This calculation does not
|
94
|
-
# make a true average (average of averages) but we found that it gave the
|
95
|
-
# best results after numerous experiments. Based on experiments we also found that
|
98
|
+
# average character width with some margin. Based on experiments we also found that
|
96
99
|
# .3 worked well.
|
97
|
-
averageCharWidth =
|
98
|
-
|
99
|
-
else
|
100
|
-
(previousAveCharWidth + (char.width / char.text.size)) / 2.0
|
101
|
-
end
|
100
|
+
averageCharWidth = char_widths_so_far.reduce(&:+).to_f / char_widths_so_far.size
|
101
|
+
|
102
102
|
deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
|
103
103
|
|
104
104
|
# Compares the values obtained by the average method and the wordSpacing method and picks
|
@@ -119,7 +119,19 @@ module Tabula
|
|
119
119
|
sameLine = false
|
120
120
|
end
|
121
121
|
|
122
|
-
|
122
|
+
# characters tend to be ordered by their left location
|
123
|
+
# in determining whether to add a space, we need to know the distance
|
124
|
+
# between the current character's left and the nearest character's
|
125
|
+
# right. The nearest character may not be the previous character, so we
|
126
|
+
# need to keep track of the character with the greatest right x-axis
|
127
|
+
# location -- that's endOfLastTextX
|
128
|
+
# (in some fonts, one character may be completely "on top of"
|
129
|
+
# another character, with the wider character starting to the left and
|
130
|
+
# ending to the right of the narrower character, e.g. ANSI
|
131
|
+
# representations of some South Asian languages, see
|
132
|
+
# https://github.com/tabulapdf/tabula/issues/303)
|
133
|
+
endOfLastTextX = [char.right, endOfLastTextX].max
|
134
|
+
|
123
135
|
# should we add a space?
|
124
136
|
if !across_vertical_ruling \
|
125
137
|
&& sameLine \
|
@@ -161,11 +173,8 @@ module Tabula
|
|
161
173
|
chunks << TextChunk.create_from_text_element(char)
|
162
174
|
end
|
163
175
|
|
164
|
-
lastWordSpacing = wordSpacing
|
165
|
-
previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
|
166
|
-
|
167
176
|
chunks
|
168
|
-
end
|
177
|
+
end.each{|chunk| chunk.text_elements.sort_by!{|char| char.left + char.right } }
|
169
178
|
end
|
170
179
|
|
171
180
|
##
|
data/lib/tabula/version.rb
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,87 +1,87 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.6
|
4
|
+
version: 0.8.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
8
8
|
- Jeremy B. Merill
|
9
9
|
- Mike Tigas
|
10
|
-
autorequire:
|
10
|
+
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2015-
|
13
|
+
date: 2015-08-20 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
17
|
-
version_requirements: !ruby/object:Gem::Requirement
|
18
|
-
requirements:
|
19
|
-
- - '>='
|
20
|
-
- !ruby/object:Gem::Version
|
21
|
-
version: 1.3.4
|
22
17
|
requirement: !ruby/object:Gem::Requirement
|
23
18
|
requirements:
|
24
|
-
- -
|
19
|
+
- - ">="
|
25
20
|
- !ruby/object:Gem::Version
|
26
21
|
version: 1.3.4
|
27
|
-
prerelease: false
|
28
22
|
type: :development
|
29
|
-
|
30
|
-
name: ruby-debug
|
23
|
+
prerelease: false
|
31
24
|
version_requirements: !ruby/object:Gem::Requirement
|
32
25
|
requirements:
|
33
|
-
- -
|
26
|
+
- - ">="
|
34
27
|
- !ruby/object:Gem::Version
|
35
|
-
version:
|
28
|
+
version: 1.3.4
|
29
|
+
- !ruby/object:Gem::Dependency
|
30
|
+
name: ruby-debug
|
36
31
|
requirement: !ruby/object:Gem::Requirement
|
37
32
|
requirements:
|
38
|
-
- -
|
33
|
+
- - ">="
|
39
34
|
- !ruby/object:Gem::Version
|
40
35
|
version: '0'
|
41
|
-
prerelease: false
|
42
36
|
type: :development
|
43
|
-
|
44
|
-
name: pry
|
37
|
+
prerelease: false
|
45
38
|
version_requirements: !ruby/object:Gem::Requirement
|
46
39
|
requirements:
|
47
|
-
- -
|
40
|
+
- - ">="
|
48
41
|
- !ruby/object:Gem::Version
|
49
42
|
version: '0'
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: pry
|
50
45
|
requirement: !ruby/object:Gem::Requirement
|
51
46
|
requirements:
|
52
|
-
- -
|
47
|
+
- - ">="
|
53
48
|
- !ruby/object:Gem::Version
|
54
49
|
version: '0'
|
55
|
-
prerelease: false
|
56
50
|
type: :development
|
57
|
-
|
58
|
-
name: minitest
|
51
|
+
prerelease: false
|
59
52
|
version_requirements: !ruby/object:Gem::Requirement
|
60
53
|
requirements:
|
61
|
-
- -
|
54
|
+
- - ">="
|
62
55
|
- !ruby/object:Gem::Version
|
63
56
|
version: '0'
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: minitest
|
64
59
|
requirement: !ruby/object:Gem::Requirement
|
65
60
|
requirements:
|
66
|
-
- -
|
61
|
+
- - ">="
|
67
62
|
- !ruby/object:Gem::Version
|
68
63
|
version: '0'
|
69
|
-
prerelease: false
|
70
64
|
type: :development
|
71
|
-
|
72
|
-
name: trollop
|
65
|
+
prerelease: false
|
73
66
|
version_requirements: !ruby/object:Gem::Requirement
|
74
67
|
requirements:
|
75
|
-
- -
|
68
|
+
- - ">="
|
76
69
|
- !ruby/object:Gem::Version
|
77
|
-
version: '
|
70
|
+
version: '0'
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: trollop
|
78
73
|
requirement: !ruby/object:Gem::Requirement
|
79
74
|
requirements:
|
80
|
-
- - ~>
|
75
|
+
- - "~>"
|
81
76
|
- !ruby/object:Gem::Version
|
82
77
|
version: '2.0'
|
83
|
-
prerelease: false
|
84
78
|
type: :runtime
|
79
|
+
prerelease: false
|
80
|
+
version_requirements: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - "~>"
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '2.0'
|
85
85
|
description: extract tables from PDF files
|
86
86
|
email:
|
87
87
|
- manuel@jazzido.com
|
@@ -90,8 +90,8 @@ executables:
|
|
90
90
|
extensions: []
|
91
91
|
extra_rdoc_files: []
|
92
92
|
files:
|
93
|
-
- .gitignore
|
94
|
-
- .travis.yml
|
93
|
+
- ".gitignore"
|
94
|
+
- ".travis.yml"
|
95
95
|
- AUTHORS.md
|
96
96
|
- Gemfile
|
97
97
|
- LICENSE.md
|
@@ -125,29 +125,30 @@ files:
|
|
125
125
|
- target/jsi-1.1.0-SNAPSHOT.jar
|
126
126
|
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
127
127
|
- target/slf4j-api-1.6.3.jar
|
128
|
+
- target/slf4j-nop-1.7.10.jar
|
128
129
|
- target/trove4j-3.0.3.jar
|
129
130
|
homepage: https://github.com/jazzido/tabula-extractor
|
130
131
|
licenses:
|
131
132
|
- MIT
|
132
133
|
metadata: {}
|
133
|
-
post_install_message:
|
134
|
+
post_install_message:
|
134
135
|
rdoc_options: []
|
135
136
|
require_paths:
|
136
137
|
- lib
|
137
138
|
required_ruby_version: !ruby/object:Gem::Requirement
|
138
139
|
requirements:
|
139
|
-
- -
|
140
|
+
- - ">="
|
140
141
|
- !ruby/object:Gem::Version
|
141
142
|
version: '0'
|
142
143
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
143
144
|
requirements:
|
144
|
-
- -
|
145
|
+
- - ">="
|
145
146
|
- !ruby/object:Gem::Version
|
146
147
|
version: '0'
|
147
148
|
requirements: []
|
148
|
-
rubyforge_project:
|
149
|
-
rubygems_version: 2.
|
150
|
-
signing_key:
|
149
|
+
rubyforge_project:
|
150
|
+
rubygems_version: 2.4.5
|
151
|
+
signing_key:
|
151
152
|
specification_version: 4
|
152
153
|
summary: extract tables from PDF files
|
153
154
|
test_files: []
|