RubyGems - tabula-extractor - Versions diffs - 0.7.6-java → 0.8.0-java - Mend

tabula-extractor 0.7.6-java → 0.8.0-java

Files changed (9)

checksums.yaml +4 -4
data/README.md +8 -1
data/lib/tabula.rb +1 -0
data/lib/tabula/entities/page.rb +2 -2
data/lib/tabula/entities/text_chunk.rb +1 -0
data/lib/tabula/entities/text_element.rb +29 -20
data/lib/tabula/version.rb +1 -1
data/target/slf4j-nop-1.7.10.jar +0 -0
metadata +41 -40

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 9ac7f1daa082acce10e82b94b01b31e07813ad4d
-  data.tar.gz: ac521bbba80d6b0571d904565cd31d9af5e7947a
+  metadata.gz: dd31a41b459d191430cf39b8a8c920b5339033e4
+  data.tar.gz: 660bb81f7fc497cdec9550898bd9a895a1d89fb6
 SHA512:
-  metadata.gz: 0389d96e5a7a8ad20c147ed3170b922a501126211bec58b012f39662425599437f3f869002825b562cae57d44645ddb776088804f237e168b71473211a86c67a
-  data.tar.gz: 53dd7bd11684bf8b8ccd03ea9352140eb3dcc346b7018bee7d6b8049e7e70ee02f59b83e360d97ddf5b7f211a1dccbb83694aab8cb016c7a4ba656f46a37c4c4
+  metadata.gz: 38d0dc513c668466cc8f45d102f28003ce4a982ea14dd56b7a71ce2e0de4efd3f237521ea481ea1541a56209ec60249f2eaef33b9be9cede86f37be2a972bed4
+  data.tar.gz: bf4bba7e5817d624e705960cbfc3b9db8ada3229ad4a1e71df4b6b5c82d4245febce6df68e3d3addf2470e00d0342d41ee02b95607358f8aa1bccfc8c67799e2

data/README.md CHANGED

@@ -161,7 +161,14 @@ extractor.extract.each_with_index do |pdf_page, page_index|
 end
 extractor.close!
 out.close
-````
+```
+`tabula-extractor` has also been used successfully as a part of data extraction pipelines. [This blog post](http://open.blogs.nytimes.com/2015/04/03/purifying-the-sea-of-pdf-data-automatically/) discusses a possible pattern for creating these and includes a few examples:
+- Sierra Leone’s Ebola situation reports: [GitHub](https://github.com/jeremybmerrill/ebola_parsers/tree/master/sierra_leone)
+- The NYPD’s CompStat criminal complaints database weekly reports: [GitHub](https://github.com/nytinteractive/compstat_parser)
+- The NYPD’s monthly reports of moving summonses: [GitHub](https://github.com/nytinteractive/moving_summonses_parser)
 ## How Does This Work? Like, Theoretically?

data/lib/tabula.rb CHANGED

@@ -6,6 +6,7 @@ end
 require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
 require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
+require File.join(File.dirname(__FILE__), '../target/', 'slf4j-nop-1.7.10.jar')
 require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
 require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')

data/lib/tabula/entities/page.rb CHANGED

@@ -67,11 +67,11 @@ module Tabula
     end
     def get_min_char_width
-      @min_char_width ||= texts.map(&:width).min
+      @min_char_width ||= texts.map(&:width).min || ::Float::INFINITY
     end
     def get_min_char_height
-      @min_char_height ||= texts.map(&:height).min
+      @min_char_height ||= texts.map(&:height).min || ::Float::INFINITY
     end
     def get_area(area)

data/lib/tabula/entities/text_chunk.rb CHANGED

@@ -46,6 +46,7 @@ module Tabula
     # returns a list of column boundaries (x axis)
     # +lines+ must be an array of lines sorted by their +top+ attribute
     def self.column_positions(lines)
+      return [] if lines.empty?
       init = lines.first.text_elements.inject([]) { |memo, text_chunk|
         next memo if text_chunk.text =~ ONLY_SPACES_RE
         memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)

data/lib/tabula/entities/text_element.rb CHANGED

@@ -43,23 +43,25 @@ module Tabula
       text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
-      previousAveCharWidth = text_chunks.first.width
       endOfLastTextX = text_chunks.first.right
       maxYForLine = text_chunks.first.bottom
       maxHeightForLine = text_chunks.first.height
       minYTopForLine = text_chunks.first.top
-      lastWordSpacing = -1
       sp = nil
+      char_widths_so_far = []
+      word_spacings_so_far = []
       text_elements.inject(text_chunks) do |chunks, char|
         current_chunk = chunks.last
         prev_char = current_chunk.text_elements.last
-        # Resets the average character width when we see a change in font
+        # Resets the character/spacing widths (used for averages) when we see a change in font
         # or a change in the font size
         if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
-          previousAveCharWidth = -1;
+          char_widths_so_far = []
+          word_spacings_so_far = []
         end
         # if same char AND overlapped, skip
@@ -78,27 +80,25 @@ module Tabula
         }
         # Estimate the expected width of the space based on the
-        # space character with some margin.
+        # average width of the space character with some margin
         wordSpacing = char.width_of_space
         deltaSpace  = 0
         deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
                        ::Float::MAX
-                     elsif lastWordSpacing < 0
+                     elsif word_spacings_so_far.empty?
                        wordSpacing * 0.5 # 0.5 == spacingTolerance
                      else
-                       ((wordSpacing + lastWordSpacing) / 2.0) * 0.5
+                       (word_spacings_so_far.reduce(&:+).to_f / word_spacings_so_far.size) * 0.5
                      end
+        word_spacings_so_far << wordSpacing
+        char_widths_so_far << (char.width / char.text.size)
         # Estimate the expected width of the space based on the
-        # average character width with some margin. This calculation does not
-        # make a true average (average of averages) but we found that it gave the
-        # best results after numerous experiments. Based on experiments we also found that
+        # average character width with some margin. Based on experiments we also found that
         # .3 worked well.
-        averageCharWidth = if previousAveCharWidth < 0
-                             char.width / char.text.size
-                           else
-                             (previousAveCharWidth + (char.width / char.text.size)) / 2.0
-                           end
+        averageCharWidth = char_widths_so_far.reduce(&:+).to_f / char_widths_so_far.size
         deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
         # Compares the values obtained by the average method and the wordSpacing method and picks
@@ -119,7 +119,19 @@ module Tabula
           sameLine = false
         end
-        endOfLastTextX = char.right
+        # characters tend to be ordered by their left location
+        # in determining whether to add a space, we need to know the distance
+        # between the current character's left and the nearest character's
+        # right. The nearest character may not be the previous character, so we
+        # need to keep track of the character with the greatest right x-axis
+        # location -- that's endOfLastTextX
+        # (in some fonts, one character may be completely "on top of"
+        # another character, with the wider character starting to the left and
+        # ending to the right of the narrower character,  e.g. ANSI
+        # representations of some South Asian languages, see
+        # https://github.com/tabulapdf/tabula/issues/303)
+        endOfLastTextX = [char.right, endOfLastTextX].max
         # should we add a space?
         if !across_vertical_ruling \
           && sameLine \
@@ -161,11 +173,8 @@ module Tabula
           chunks << TextChunk.create_from_text_element(char)
         end
-        lastWordSpacing = wordSpacing
-        previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
         chunks
-      end
+      end.each{|chunk| chunk.text_elements.sort_by!{|char| char.left + char.right } }
     end
     ##

data/lib/tabula/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Tabula
-  VERSION = '0.7.6'
+  VERSION = '0.8.0'
 end

data/target/slf4j-nop-1.7.10.jar ADDED

Binary file

metadata CHANGED

@@ -1,87 +1,87 @@
 --- !ruby/object:Gem::Specification
 name: tabula-extractor
 version: !ruby/object:Gem::Version
-  version: 0.7.6
+  version: 0.8.0
 platform: java
 authors:
 - Manuel Aristarán
 - Jeremy B. Merill
 - Mike Tigas
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-31 00:00:00.000000000 Z
+date: 2015-08-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - '>='
-      - !ruby/object:Gem::Version
-        version: 1.3.4
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.3.4
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: ruby-debug
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.3.4
+- !ruby/object:Gem::Dependency
+  name: ruby-debug
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: pry
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: pry
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: minitest
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: trollop
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '2.0'
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: trollop
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '2.0'
-  prerelease: false
   type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
 description: extract tables from PDF files
 email:
 - manuel@jazzido.com
@@ -90,8 +90,8 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .travis.yml
+- ".gitignore"
+- ".travis.yml"
 - AUTHORS.md
 - Gemfile
 - LICENSE.md
@@ -125,29 +125,30 @@ files:
 - target/jsi-1.1.0-SNAPSHOT.jar
 - target/pdfbox-app-2.0.0-SNAPSHOT.jar
 - target/slf4j-api-1.6.3.jar
+- target/slf4j-nop-1.7.10.jar
 - target/trove4j-3.0.3.jar
 homepage: https://github.com/jazzido/tabula-extractor
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.1.9
-signing_key:
+rubyforge_project:
+rubygems_version: 2.4.5
+signing_key:
 specification_version: 4
 summary: extract tables from PDF files
 test_files: []