tabula-extractor 0.7.6-java → 0.8.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9ac7f1daa082acce10e82b94b01b31e07813ad4d
4
- data.tar.gz: ac521bbba80d6b0571d904565cd31d9af5e7947a
3
+ metadata.gz: dd31a41b459d191430cf39b8a8c920b5339033e4
4
+ data.tar.gz: 660bb81f7fc497cdec9550898bd9a895a1d89fb6
5
5
  SHA512:
6
- metadata.gz: 0389d96e5a7a8ad20c147ed3170b922a501126211bec58b012f39662425599437f3f869002825b562cae57d44645ddb776088804f237e168b71473211a86c67a
7
- data.tar.gz: 53dd7bd11684bf8b8ccd03ea9352140eb3dcc346b7018bee7d6b8049e7e70ee02f59b83e360d97ddf5b7f211a1dccbb83694aab8cb016c7a4ba656f46a37c4c4
6
+ metadata.gz: 38d0dc513c668466cc8f45d102f28003ce4a982ea14dd56b7a71ce2e0de4efd3f237521ea481ea1541a56209ec60249f2eaef33b9be9cede86f37be2a972bed4
7
+ data.tar.gz: bf4bba7e5817d624e705960cbfc3b9db8ada3229ad4a1e71df4b6b5c82d4245febce6df68e3d3addf2470e00d0342d41ee02b95607358f8aa1bccfc8c67799e2
data/README.md CHANGED
@@ -161,7 +161,14 @@ extractor.extract.each_with_index do |pdf_page, page_index|
161
161
  end
162
162
  extractor.close!
163
163
  out.close
164
- ````
164
+ ```
165
+
166
+ `tabula-extractor` has also been used successfully as a part of data extraction pipelines. [This blog post](http://open.blogs.nytimes.com/2015/04/03/purifying-the-sea-of-pdf-data-automatically/) discusses a possible pattern for creating these and includes a few examples:
167
+
168
+ - Sierra Leone’s Ebola situation reports: [GitHub](https://github.com/jeremybmerrill/ebola_parsers/tree/master/sierra_leone)
169
+ - The NYPD’s CompStat criminal complaints database weekly reports: [GitHub](https://github.com/nytinteractive/compstat_parser)
170
+ - The NYPD’s monthly reports of moving summonses: [GitHub](https://github.com/nytinteractive/moving_summonses_parser)
171
+
165
172
 
166
173
  ## How Does This Work? Like, Theoretically?
167
174
 
@@ -6,6 +6,7 @@ end
6
6
 
7
7
  require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
8
8
  require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
9
+ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-nop-1.7.10.jar')
9
10
  require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
10
11
  require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
11
12
 
@@ -67,11 +67,11 @@ module Tabula
67
67
  end
68
68
 
69
69
  def get_min_char_width
70
- @min_char_width ||= texts.map(&:width).min
70
+ @min_char_width ||= texts.map(&:width).min || ::Float::INFINITY
71
71
  end
72
72
 
73
73
  def get_min_char_height
74
- @min_char_height ||= texts.map(&:height).min
74
+ @min_char_height ||= texts.map(&:height).min || ::Float::INFINITY
75
75
  end
76
76
 
77
77
  def get_area(area)
@@ -46,6 +46,7 @@ module Tabula
46
46
  # returns a list of column boundaries (x axis)
47
47
  # +lines+ must be an array of lines sorted by their +top+ attribute
48
48
  def self.column_positions(lines)
49
+ return [] if lines.empty?
49
50
  init = lines.first.text_elements.inject([]) { |memo, text_chunk|
50
51
  next memo if text_chunk.text =~ ONLY_SPACES_RE
51
52
  memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)
@@ -43,23 +43,25 @@ module Tabula
43
43
  text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
44
44
 
45
45
 
46
- previousAveCharWidth = text_chunks.first.width
47
46
  endOfLastTextX = text_chunks.first.right
48
47
  maxYForLine = text_chunks.first.bottom
49
48
  maxHeightForLine = text_chunks.first.height
50
49
  minYTopForLine = text_chunks.first.top
51
- lastWordSpacing = -1
52
50
  sp = nil
53
51
 
52
+ char_widths_so_far = []
53
+ word_spacings_so_far = []
54
+
54
55
  text_elements.inject(text_chunks) do |chunks, char|
55
56
 
56
57
  current_chunk = chunks.last
57
58
  prev_char = current_chunk.text_elements.last
58
59
 
59
- # Resets the average character width when we see a change in font
60
+ # Resets the character/spacing widths (used for averages) when we see a change in font
60
61
  # or a change in the font size
61
62
  if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
62
- previousAveCharWidth = -1;
63
+ char_widths_so_far = []
64
+ word_spacings_so_far = []
63
65
  end
64
66
 
65
67
  # if same char AND overlapped, skip
@@ -78,27 +80,25 @@ module Tabula
78
80
  }
79
81
 
80
82
  # Estimate the expected width of the space based on the
81
- # space character with some margin.
83
+ # average width of the space character with some margin
82
84
  wordSpacing = char.width_of_space
83
85
  deltaSpace = 0
84
86
  deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
85
87
  ::Float::MAX
86
- elsif lastWordSpacing < 0
88
+ elsif word_spacings_so_far.empty?
87
89
  wordSpacing * 0.5 # 0.5 == spacingTolerance
88
90
  else
89
- ((wordSpacing + lastWordSpacing) / 2.0) * 0.5
91
+ (word_spacings_so_far.reduce(&:+).to_f / word_spacings_so_far.size) * 0.5
90
92
  end
91
93
 
94
+ word_spacings_so_far << wordSpacing
95
+ char_widths_so_far << (char.width / char.text.size)
96
+
92
97
  # Estimate the expected width of the space based on the
93
- # average character width with some margin. This calculation does not
94
- # make a true average (average of averages) but we found that it gave the
95
- # best results after numerous experiments. Based on experiments we also found that
98
+ # average character width with some margin. Based on experiments we also found that
96
99
  # .3 worked well.
97
- averageCharWidth = if previousAveCharWidth < 0
98
- char.width / char.text.size
99
- else
100
- (previousAveCharWidth + (char.width / char.text.size)) / 2.0
101
- end
100
+ averageCharWidth = char_widths_so_far.reduce(&:+).to_f / char_widths_so_far.size
101
+
102
102
  deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
103
103
 
104
104
  # Compares the values obtained by the average method and the wordSpacing method and picks
@@ -119,7 +119,19 @@ module Tabula
119
119
  sameLine = false
120
120
  end
121
121
 
122
- endOfLastTextX = char.right
122
+ # characters tend to be ordered by their left location
123
+ # in determining whether to add a space, we need to know the distance
124
+ # between the current character's left and the nearest character's
125
+ # right. The nearest character may not be the previous character, so we
126
+ # need to keep track of the character with the greatest right x-axis
127
+ # location -- that's endOfLastTextX
128
+ # (in some fonts, one character may be completely "on top of"
129
+ # another character, with the wider character starting to the left and
130
+ # ending to the right of the narrower character, e.g. ANSI
131
+ # representations of some South Asian languages, see
132
+ # https://github.com/tabulapdf/tabula/issues/303)
133
+ endOfLastTextX = [char.right, endOfLastTextX].max
134
+
123
135
  # should we add a space?
124
136
  if !across_vertical_ruling \
125
137
  && sameLine \
@@ -161,11 +173,8 @@ module Tabula
161
173
  chunks << TextChunk.create_from_text_element(char)
162
174
  end
163
175
 
164
- lastWordSpacing = wordSpacing
165
- previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
166
-
167
176
  chunks
168
- end
177
+ end.each{|chunk| chunk.text_elements.sort_by!{|char| char.left + char.right } }
169
178
  end
170
179
 
171
180
  ##
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.7.6'
2
+ VERSION = '0.8.0'
3
3
  end
metadata CHANGED
@@ -1,87 +1,87 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.6
4
+ version: 0.8.0
5
5
  platform: java
6
6
  authors:
7
7
  - Manuel Aristarán
8
8
  - Jeremy B. Merill
9
9
  - Mike Tigas
10
- autorequire:
10
+ autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2015-01-31 00:00:00.000000000 Z
13
+ date: 2015-08-20 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: bundler
17
- version_requirements: !ruby/object:Gem::Requirement
18
- requirements:
19
- - - '>='
20
- - !ruby/object:Gem::Version
21
- version: 1.3.4
22
17
  requirement: !ruby/object:Gem::Requirement
23
18
  requirements:
24
- - - '>='
19
+ - - ">="
25
20
  - !ruby/object:Gem::Version
26
21
  version: 1.3.4
27
- prerelease: false
28
22
  type: :development
29
- - !ruby/object:Gem::Dependency
30
- name: ruby-debug
23
+ prerelease: false
31
24
  version_requirements: !ruby/object:Gem::Requirement
32
25
  requirements:
33
- - - '>='
26
+ - - ">="
34
27
  - !ruby/object:Gem::Version
35
- version: '0'
28
+ version: 1.3.4
29
+ - !ruby/object:Gem::Dependency
30
+ name: ruby-debug
36
31
  requirement: !ruby/object:Gem::Requirement
37
32
  requirements:
38
- - - '>='
33
+ - - ">="
39
34
  - !ruby/object:Gem::Version
40
35
  version: '0'
41
- prerelease: false
42
36
  type: :development
43
- - !ruby/object:Gem::Dependency
44
- name: pry
37
+ prerelease: false
45
38
  version_requirements: !ruby/object:Gem::Requirement
46
39
  requirements:
47
- - - '>='
40
+ - - ">="
48
41
  - !ruby/object:Gem::Version
49
42
  version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: pry
50
45
  requirement: !ruby/object:Gem::Requirement
51
46
  requirements:
52
- - - '>='
47
+ - - ">="
53
48
  - !ruby/object:Gem::Version
54
49
  version: '0'
55
- prerelease: false
56
50
  type: :development
57
- - !ruby/object:Gem::Dependency
58
- name: minitest
51
+ prerelease: false
59
52
  version_requirements: !ruby/object:Gem::Requirement
60
53
  requirements:
61
- - - '>='
54
+ - - ">="
62
55
  - !ruby/object:Gem::Version
63
56
  version: '0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: minitest
64
59
  requirement: !ruby/object:Gem::Requirement
65
60
  requirements:
66
- - - '>='
61
+ - - ">="
67
62
  - !ruby/object:Gem::Version
68
63
  version: '0'
69
- prerelease: false
70
64
  type: :development
71
- - !ruby/object:Gem::Dependency
72
- name: trollop
65
+ prerelease: false
73
66
  version_requirements: !ruby/object:Gem::Requirement
74
67
  requirements:
75
- - - ~>
68
+ - - ">="
76
69
  - !ruby/object:Gem::Version
77
- version: '2.0'
70
+ version: '0'
71
+ - !ruby/object:Gem::Dependency
72
+ name: trollop
78
73
  requirement: !ruby/object:Gem::Requirement
79
74
  requirements:
80
- - - ~>
75
+ - - "~>"
81
76
  - !ruby/object:Gem::Version
82
77
  version: '2.0'
83
- prerelease: false
84
78
  type: :runtime
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - "~>"
83
+ - !ruby/object:Gem::Version
84
+ version: '2.0'
85
85
  description: extract tables from PDF files
86
86
  email:
87
87
  - manuel@jazzido.com
@@ -90,8 +90,8 @@ executables:
90
90
  extensions: []
91
91
  extra_rdoc_files: []
92
92
  files:
93
- - .gitignore
94
- - .travis.yml
93
+ - ".gitignore"
94
+ - ".travis.yml"
95
95
  - AUTHORS.md
96
96
  - Gemfile
97
97
  - LICENSE.md
@@ -125,29 +125,30 @@ files:
125
125
  - target/jsi-1.1.0-SNAPSHOT.jar
126
126
  - target/pdfbox-app-2.0.0-SNAPSHOT.jar
127
127
  - target/slf4j-api-1.6.3.jar
128
+ - target/slf4j-nop-1.7.10.jar
128
129
  - target/trove4j-3.0.3.jar
129
130
  homepage: https://github.com/jazzido/tabula-extractor
130
131
  licenses:
131
132
  - MIT
132
133
  metadata: {}
133
- post_install_message:
134
+ post_install_message:
134
135
  rdoc_options: []
135
136
  require_paths:
136
137
  - lib
137
138
  required_ruby_version: !ruby/object:Gem::Requirement
138
139
  requirements:
139
- - - '>='
140
+ - - ">="
140
141
  - !ruby/object:Gem::Version
141
142
  version: '0'
142
143
  required_rubygems_version: !ruby/object:Gem::Requirement
143
144
  requirements:
144
- - - '>='
145
+ - - ">="
145
146
  - !ruby/object:Gem::Version
146
147
  version: '0'
147
148
  requirements: []
148
- rubyforge_project:
149
- rubygems_version: 2.1.9
150
- signing_key:
149
+ rubyforge_project:
150
+ rubygems_version: 2.4.5
151
+ signing_key:
151
152
  specification_version: 4
152
153
  summary: extract tables from PDF files
153
154
  test_files: []