RubyGems - tabula-extractor - Versions diffs - 0.7.6-java → 0.8.0-java - Mend

tabula-extractor 0.7.6-java → 0.8.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/README.md +8 -1
data/lib/tabula.rb +1 -0
data/lib/tabula/entities/page.rb +2 -2
data/lib/tabula/entities/text_chunk.rb +1 -0
data/lib/tabula/entities/text_element.rb +29 -20
data/lib/tabula/version.rb +1 -1
data/target/slf4j-nop-1.7.10.jar +0 -0
metadata +41 -40

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 9ac7f1daa082acce10e82b94b01b31e07813ad4d
-  data.tar.gz: ac521bbba80d6b0571d904565cd31d9af5e7947a
+  metadata.gz: dd31a41b459d191430cf39b8a8c920b5339033e4
+  data.tar.gz: 660bb81f7fc497cdec9550898bd9a895a1d89fb6
 SHA512:
-  metadata.gz: 0389d96e5a7a8ad20c147ed3170b922a501126211bec58b012f39662425599437f3f869002825b562cae57d44645ddb776088804f237e168b71473211a86c67a
-  data.tar.gz: 53dd7bd11684bf8b8ccd03ea9352140eb3dcc346b7018bee7d6b8049e7e70ee02f59b83e360d97ddf5b7f211a1dccbb83694aab8cb016c7a4ba656f46a37c4c4
+  metadata.gz: 38d0dc513c668466cc8f45d102f28003ce4a982ea14dd56b7a71ce2e0de4efd3f237521ea481ea1541a56209ec60249f2eaef33b9be9cede86f37be2a972bed4
+  data.tar.gz: bf4bba7e5817d624e705960cbfc3b9db8ada3229ad4a1e71df4b6b5c82d4245febce6df68e3d3addf2470e00d0342d41ee02b95607358f8aa1bccfc8c67799e2

data/README.md CHANGED

@@ -161,7 +161,14 @@ extractor.extract.each_with_index do |pdf_page, page_index|
 end
 extractor.close!
 out.close
-````
+```
+`tabula-extractor` has also been used successfully as a part of data extraction pipelines. [This blog post](http://open.blogs.nytimes.com/2015/04/03/purifying-the-sea-of-pdf-data-automatically/) discusses a possible pattern for creating these and includes a few examples:
+- Sierra Leone’s Ebola situation reports: [GitHub](https://github.com/jeremybmerrill/ebola_parsers/tree/master/sierra_leone)
+- The NYPD’s CompStat criminal complaints database weekly reports: [GitHub](https://github.com/nytinteractive/compstat_parser)
+- The NYPD’s monthly reports of moving summonses: [GitHub](https://github.com/nytinteractive/moving_summonses_parser)
 ## How Does This Work? Like, Theoretically?

data/lib/tabula.rb CHANGED

@@ -6,6 +6,7 @@ end
 require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
 require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
+require File.join(File.dirname(__FILE__), '../target/', 'slf4j-nop-1.7.10.jar')
 require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
 require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')

data/lib/tabula/entities/page.rb CHANGED

@@ -67,11 +67,11 @@ module Tabula
     end
     def get_min_char_width
-      @min_char_width ||= texts.map(&:width).min
+      @min_char_width ||= texts.map(&:width).min || ::Float::INFINITY
     end
     def get_min_char_height
-      @min_char_height ||= texts.map(&:height).min
+      @min_char_height ||= texts.map(&:height).min || ::Float::INFINITY
     end
     def get_area(area)

data/lib/tabula/entities/text_chunk.rb CHANGED

@@ -46,6 +46,7 @@ module Tabula
     # returns a list of column boundaries (x axis)
     # +lines+ must be an array of lines sorted by their +top+ attribute
     def self.column_positions(lines)
+      return [] if lines.empty?
       init = lines.first.text_elements.inject([]) { |memo, text_chunk|
         next memo if text_chunk.text =~ ONLY_SPACES_RE
         memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)

data/lib/tabula/entities/text_element.rb CHANGED

@@ -43,23 +43,25 @@ module Tabula
       text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
-      previousAveCharWidth = text_chunks.first.width
       endOfLastTextX = text_chunks.first.right
       maxYForLine = text_chunks.first.bottom
       maxHeightForLine = text_chunks.first.height
       minYTopForLine = text_chunks.first.top
-      lastWordSpacing = -1
       sp = nil
+      char_widths_so_far = []
+      word_spacings_so_far = []
       text_elements.inject(text_chunks) do |chunks, char|
         current_chunk = chunks.last
         prev_char = current_chunk.text_elements.last
-        # Resets the average character width when we see a change in font
+        # Resets the character/spacing widths (used for averages) when we see a change in font
         # or a change in the font size
         if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
-          previousAveCharWidth = -1;
+          char_widths_so_far = []
+          word_spacings_so_far = []
         end
         # if same char AND overlapped, skip
@@ -78,27 +80,25 @@ module Tabula
         }
         # Estimate the expected width of the space based on the
-        # space character with some margin.
+        # average width of the space character with some margin
         wordSpacing = char.width_of_space
         deltaSpace  = 0
         deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
                        ::Float::MAX
-                     elsif lastWordSpacing < 0
+                     elsif word_spacings_so_far.empty?
                        wordSpacing * 0.5 # 0.5 == spacingTolerance
                      else
-                       ((wordSpacing + lastWordSpacing) / 2.0) * 0.5
+                       (word_spacings_so_far.reduce(&:+).to_f / word_spacings_so_far.size) * 0.5
                      end
+        word_spacings_so_far << wordSpacing
+        char_widths_so_far << (char.width / char.text.size)
         # Estimate the expected width of the space based on the
-        # average character width with some margin. This calculation does not
-        # make a true average (average of averages) but we found that it gave the
-        # best results after numerous experiments. Based on experiments we also found that
+        # average character width with some margin. Based on experiments we also found that
         # .3 worked well.
-        averageCharWidth = if previousAveCharWidth < 0
-                             char.width / char.text.size
-                           else
-                             (previousAveCharWidth + (char.width / char.text.size)) / 2.0
-                           end
+        averageCharWidth = char_widths_so_far.reduce(&:+).to_f / char_widths_so_far.size
         deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
         # Compares the values obtained by the average method and the wordSpacing method and picks
@@ -119,7 +119,19 @@ module Tabula
           sameLine = false
         end
-        endOfLastTextX = char.right
+        # characters tend to be ordered by their left location
+        # in determining whether to add a space, we need to know the distance
+        # between the current character's left and the nearest character's
+        # right. The nearest character may not be the previous character, so we
+        # need to keep track of the character with the greatest right x-axis
+        # location -- that's endOfLastTextX
+        # (in some fonts, one character may be completely "on top of"
+        # another character, with the wider character starting to the left and
+        # ending to the right of the narrower character,  e.g. ANSI
+        # representations of some South Asian languages, see
+        # https://github.com/tabulapdf/tabula/issues/303)
+        endOfLastTextX = [char.right, endOfLastTextX].max
         # should we add a space?
         if !across_vertical_ruling \
           && sameLine \
@@ -161,11 +173,8 @@ module Tabula
           chunks << TextChunk.create_from_text_element(char)
         end
-        lastWordSpacing = wordSpacing
-        previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
         chunks
-      end
+      end.each{|chunk| chunk.text_elements.sort_by!{|char| char.left + char.right } }
     end
     ##

data/lib/tabula/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Tabula
-  VERSION = '0.7.6'
+  VERSION = '0.8.0'
 end

data/target/slf4j-nop-1.7.10.jar ADDED

Binary file

metadata CHANGED

@@ -1,87 +1,87 @@
 --- !ruby/object:Gem::Specification
 name: tabula-extractor
 version: !ruby/object:Gem::Version
-  version: 0.7.6
+  version: 0.8.0
 platform: java
 authors:
 - Manuel Aristarán
 - Jeremy B. Merill
 - Mike Tigas
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-31 00:00:00.000000000 Z
+date: 2015-08-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - '>='
-      - !ruby/object:Gem::Version
-        version: 1.3.4
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.3.4
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: ruby-debug
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.3.4
+- !ruby/object:Gem::Dependency
+  name: ruby-debug
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: pry
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: pry
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: minitest
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: trollop
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '2.0'
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: trollop
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '2.0'
-  prerelease: false
   type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
 description: extract tables from PDF files
 email:
 - manuel@jazzido.com
@@ -90,8 +90,8 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .travis.yml
+- ".gitignore"
+- ".travis.yml"
 - AUTHORS.md
 - Gemfile
 - LICENSE.md
@@ -125,29 +125,30 @@ files:
 - target/jsi-1.1.0-SNAPSHOT.jar
 - target/pdfbox-app-2.0.0-SNAPSHOT.jar
 - target/slf4j-api-1.6.3.jar
+- target/slf4j-nop-1.7.10.jar
 - target/trove4j-3.0.3.jar
 homepage: https://github.com/jazzido/tabula-extractor
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.1.9
-signing_key:
+rubyforge_project:
+rubygems_version: 2.4.5
+signing_key:
 specification_version: 4
 summary: extract tables from PDF files
 test_files: []