tabula-extractor 0.7.6-java → 0.8.0-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9ac7f1daa082acce10e82b94b01b31e07813ad4d
4
- data.tar.gz: ac521bbba80d6b0571d904565cd31d9af5e7947a
3
+ metadata.gz: dd31a41b459d191430cf39b8a8c920b5339033e4
4
+ data.tar.gz: 660bb81f7fc497cdec9550898bd9a895a1d89fb6
5
5
  SHA512:
6
- metadata.gz: 0389d96e5a7a8ad20c147ed3170b922a501126211bec58b012f39662425599437f3f869002825b562cae57d44645ddb776088804f237e168b71473211a86c67a
7
- data.tar.gz: 53dd7bd11684bf8b8ccd03ea9352140eb3dcc346b7018bee7d6b8049e7e70ee02f59b83e360d97ddf5b7f211a1dccbb83694aab8cb016c7a4ba656f46a37c4c4
6
+ metadata.gz: 38d0dc513c668466cc8f45d102f28003ce4a982ea14dd56b7a71ce2e0de4efd3f237521ea481ea1541a56209ec60249f2eaef33b9be9cede86f37be2a972bed4
7
+ data.tar.gz: bf4bba7e5817d624e705960cbfc3b9db8ada3229ad4a1e71df4b6b5c82d4245febce6df68e3d3addf2470e00d0342d41ee02b95607358f8aa1bccfc8c67799e2
data/README.md CHANGED
@@ -161,7 +161,14 @@ extractor.extract.each_with_index do |pdf_page, page_index|
161
161
  end
162
162
  extractor.close!
163
163
  out.close
164
- ````
164
+ ```
165
+
166
+ `tabula-extractor` has also been used successfully as a part of data extraction pipelines. [This blog post](http://open.blogs.nytimes.com/2015/04/03/purifying-the-sea-of-pdf-data-automatically/) discusses a possible pattern for creating these and includes a few examples:
167
+
168
+ - Sierra Leone’s Ebola situation reports: [GitHub](https://github.com/jeremybmerrill/ebola_parsers/tree/master/sierra_leone)
169
+ - The NYPD’s CompStat criminal complaints database weekly reports: [GitHub](https://github.com/nytinteractive/compstat_parser)
170
+ - The NYPD’s monthly reports of moving summonses: [GitHub](https://github.com/nytinteractive/moving_summonses_parser)
171
+
165
172
 
166
173
  ## How Does This Work? Like, Theoretically?
167
174
 
@@ -6,6 +6,7 @@ end
6
6
 
7
7
  require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
8
8
  require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
9
+ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-nop-1.7.10.jar')
9
10
  require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
10
11
  require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
11
12
 
@@ -67,11 +67,11 @@ module Tabula
67
67
  end
68
68
 
69
69
  def get_min_char_width
70
- @min_char_width ||= texts.map(&:width).min
70
+ @min_char_width ||= texts.map(&:width).min || ::Float::INFINITY
71
71
  end
72
72
 
73
73
  def get_min_char_height
74
- @min_char_height ||= texts.map(&:height).min
74
+ @min_char_height ||= texts.map(&:height).min || ::Float::INFINITY
75
75
  end
76
76
 
77
77
  def get_area(area)
@@ -46,6 +46,7 @@ module Tabula
46
46
  # returns a list of column boundaries (x axis)
47
47
  # +lines+ must be an array of lines sorted by their +top+ attribute
48
48
  def self.column_positions(lines)
49
+ return [] if lines.empty?
49
50
  init = lines.first.text_elements.inject([]) { |memo, text_chunk|
50
51
  next memo if text_chunk.text =~ ONLY_SPACES_RE
51
52
  memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)
@@ -43,23 +43,25 @@ module Tabula
43
43
  text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
44
44
 
45
45
 
46
- previousAveCharWidth = text_chunks.first.width
47
46
  endOfLastTextX = text_chunks.first.right
48
47
  maxYForLine = text_chunks.first.bottom
49
48
  maxHeightForLine = text_chunks.first.height
50
49
  minYTopForLine = text_chunks.first.top
51
- lastWordSpacing = -1
52
50
  sp = nil
53
51
 
52
+ char_widths_so_far = []
53
+ word_spacings_so_far = []
54
+
54
55
  text_elements.inject(text_chunks) do |chunks, char|
55
56
 
56
57
  current_chunk = chunks.last
57
58
  prev_char = current_chunk.text_elements.last
58
59
 
59
- # Resets the average character width when we see a change in font
60
+ # Resets the character/spacing widths (used for averages) when we see a change in font
60
61
  # or a change in the font size
61
62
  if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
62
- previousAveCharWidth = -1;
63
+ char_widths_so_far = []
64
+ word_spacings_so_far = []
63
65
  end
64
66
 
65
67
  # if same char AND overlapped, skip
@@ -78,27 +80,25 @@ module Tabula
78
80
  }
79
81
 
80
82
  # Estimate the expected width of the space based on the
81
- # space character with some margin.
83
+ # average width of the space character with some margin
82
84
  wordSpacing = char.width_of_space
83
85
  deltaSpace = 0
84
86
  deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
85
87
  ::Float::MAX
86
- elsif lastWordSpacing < 0
88
+ elsif word_spacings_so_far.empty?
87
89
  wordSpacing * 0.5 # 0.5 == spacingTolerance
88
90
  else
89
- ((wordSpacing + lastWordSpacing) / 2.0) * 0.5
91
+ (word_spacings_so_far.reduce(&:+).to_f / word_spacings_so_far.size) * 0.5
90
92
  end
91
93
 
94
+ word_spacings_so_far << wordSpacing
95
+ char_widths_so_far << (char.width / char.text.size)
96
+
92
97
  # Estimate the expected width of the space based on the
93
- # average character width with some margin. This calculation does not
94
- # make a true average (average of averages) but we found that it gave the
95
- # best results after numerous experiments. Based on experiments we also found that
98
+ # average character width with some margin. Based on experiments we also found that
96
99
  # .3 worked well.
97
- averageCharWidth = if previousAveCharWidth < 0
98
- char.width / char.text.size
99
- else
100
- (previousAveCharWidth + (char.width / char.text.size)) / 2.0
101
- end
100
+ averageCharWidth = char_widths_so_far.reduce(&:+).to_f / char_widths_so_far.size
101
+
102
102
  deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
103
103
 
104
104
  # Compares the values obtained by the average method and the wordSpacing method and picks
@@ -119,7 +119,19 @@ module Tabula
119
119
  sameLine = false
120
120
  end
121
121
 
122
- endOfLastTextX = char.right
122
+ # characters tend to be ordered by their left location
123
+ # in determining whether to add a space, we need to know the distance
124
+ # between the current character's left and the nearest character's
125
+ # right. The nearest character may not be the previous character, so we
126
+ # need to keep track of the character with the greatest right x-axis
127
+ # location -- that's endOfLastTextX
128
+ # (in some fonts, one character may be completely "on top of"
129
+ # another character, with the wider character starting to the left and
130
+ # ending to the right of the narrower character, e.g. ANSI
131
+ # representations of some South Asian languages, see
132
+ # https://github.com/tabulapdf/tabula/issues/303)
133
+ endOfLastTextX = [char.right, endOfLastTextX].max
134
+
123
135
  # should we add a space?
124
136
  if !across_vertical_ruling \
125
137
  && sameLine \
@@ -161,11 +173,8 @@ module Tabula
161
173
  chunks << TextChunk.create_from_text_element(char)
162
174
  end
163
175
 
164
- lastWordSpacing = wordSpacing
165
- previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
166
-
167
176
  chunks
168
- end
177
+ end.each{|chunk| chunk.text_elements.sort_by!{|char| char.left + char.right } }
169
178
  end
170
179
 
171
180
  ##
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.7.6'
2
+ VERSION = '0.8.0'
3
3
  end
metadata CHANGED
@@ -1,87 +1,87 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.6
4
+ version: 0.8.0
5
5
  platform: java
6
6
  authors:
7
7
  - Manuel Aristarán
8
8
  - Jeremy B. Merill
9
9
  - Mike Tigas
10
- autorequire:
10
+ autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2015-01-31 00:00:00.000000000 Z
13
+ date: 2015-08-20 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: bundler
17
- version_requirements: !ruby/object:Gem::Requirement
18
- requirements:
19
- - - '>='
20
- - !ruby/object:Gem::Version
21
- version: 1.3.4
22
17
  requirement: !ruby/object:Gem::Requirement
23
18
  requirements:
24
- - - '>='
19
+ - - ">="
25
20
  - !ruby/object:Gem::Version
26
21
  version: 1.3.4
27
- prerelease: false
28
22
  type: :development
29
- - !ruby/object:Gem::Dependency
30
- name: ruby-debug
23
+ prerelease: false
31
24
  version_requirements: !ruby/object:Gem::Requirement
32
25
  requirements:
33
- - - '>='
26
+ - - ">="
34
27
  - !ruby/object:Gem::Version
35
- version: '0'
28
+ version: 1.3.4
29
+ - !ruby/object:Gem::Dependency
30
+ name: ruby-debug
36
31
  requirement: !ruby/object:Gem::Requirement
37
32
  requirements:
38
- - - '>='
33
+ - - ">="
39
34
  - !ruby/object:Gem::Version
40
35
  version: '0'
41
- prerelease: false
42
36
  type: :development
43
- - !ruby/object:Gem::Dependency
44
- name: pry
37
+ prerelease: false
45
38
  version_requirements: !ruby/object:Gem::Requirement
46
39
  requirements:
47
- - - '>='
40
+ - - ">="
48
41
  - !ruby/object:Gem::Version
49
42
  version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: pry
50
45
  requirement: !ruby/object:Gem::Requirement
51
46
  requirements:
52
- - - '>='
47
+ - - ">="
53
48
  - !ruby/object:Gem::Version
54
49
  version: '0'
55
- prerelease: false
56
50
  type: :development
57
- - !ruby/object:Gem::Dependency
58
- name: minitest
51
+ prerelease: false
59
52
  version_requirements: !ruby/object:Gem::Requirement
60
53
  requirements:
61
- - - '>='
54
+ - - ">="
62
55
  - !ruby/object:Gem::Version
63
56
  version: '0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: minitest
64
59
  requirement: !ruby/object:Gem::Requirement
65
60
  requirements:
66
- - - '>='
61
+ - - ">="
67
62
  - !ruby/object:Gem::Version
68
63
  version: '0'
69
- prerelease: false
70
64
  type: :development
71
- - !ruby/object:Gem::Dependency
72
- name: trollop
65
+ prerelease: false
73
66
  version_requirements: !ruby/object:Gem::Requirement
74
67
  requirements:
75
- - - ~>
68
+ - - ">="
76
69
  - !ruby/object:Gem::Version
77
- version: '2.0'
70
+ version: '0'
71
+ - !ruby/object:Gem::Dependency
72
+ name: trollop
78
73
  requirement: !ruby/object:Gem::Requirement
79
74
  requirements:
80
- - - ~>
75
+ - - "~>"
81
76
  - !ruby/object:Gem::Version
82
77
  version: '2.0'
83
- prerelease: false
84
78
  type: :runtime
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - "~>"
83
+ - !ruby/object:Gem::Version
84
+ version: '2.0'
85
85
  description: extract tables from PDF files
86
86
  email:
87
87
  - manuel@jazzido.com
@@ -90,8 +90,8 @@ executables:
90
90
  extensions: []
91
91
  extra_rdoc_files: []
92
92
  files:
93
- - .gitignore
94
- - .travis.yml
93
+ - ".gitignore"
94
+ - ".travis.yml"
95
95
  - AUTHORS.md
96
96
  - Gemfile
97
97
  - LICENSE.md
@@ -125,29 +125,30 @@ files:
125
125
  - target/jsi-1.1.0-SNAPSHOT.jar
126
126
  - target/pdfbox-app-2.0.0-SNAPSHOT.jar
127
127
  - target/slf4j-api-1.6.3.jar
128
+ - target/slf4j-nop-1.7.10.jar
128
129
  - target/trove4j-3.0.3.jar
129
130
  homepage: https://github.com/jazzido/tabula-extractor
130
131
  licenses:
131
132
  - MIT
132
133
  metadata: {}
133
- post_install_message:
134
+ post_install_message:
134
135
  rdoc_options: []
135
136
  require_paths:
136
137
  - lib
137
138
  required_ruby_version: !ruby/object:Gem::Requirement
138
139
  requirements:
139
- - - '>='
140
+ - - ">="
140
141
  - !ruby/object:Gem::Version
141
142
  version: '0'
142
143
  required_rubygems_version: !ruby/object:Gem::Requirement
143
144
  requirements:
144
- - - '>='
145
+ - - ">="
145
146
  - !ruby/object:Gem::Version
146
147
  version: '0'
147
148
  requirements: []
148
- rubyforge_project:
149
- rubygems_version: 2.1.9
150
- signing_key:
149
+ rubyforge_project:
150
+ rubygems_version: 2.4.5
151
+ signing_key:
151
152
  specification_version: 4
152
153
  summary: extract tables from PDF files
153
154
  test_files: []