RubyGems - gaspar-pdf - Versions diffs - 0.0.1 → 0.0.2 - Mend

gaspar-pdf 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 61dc07bbed984fb395fe3e6de603f862906c853a
-  data.tar.gz: b1130b45efc157db8a9c0ccc6a6f768d71b6e159
+  metadata.gz: 5d61897dcb86359510caf89940b8b4273f8df92d
+  data.tar.gz: 44482e7a3dd4bb9c53995ad16e49ae9a01142bc0
 SHA512:
-  metadata.gz: 1aa7157bf22ca40cb902fa894a17ac329e52d002b6387fe9fa0850537ab205b4063ad3e6e77a919e1d4e10461c8dc02eabdee733811a4e4481930be54a6fd989
-  data.tar.gz: b4aeb3d4ade931275b57d2924ae1639257634d98460bdfffe85c34a3e7b921d9e4dd7d18fb0f61cd8d17f0540de13875bb097ae582566ce2c49a53459ffee998
+  metadata.gz: a50d871c11e83bb6a696bee1d75df7be959dd2979337d408d3e84c70aa303a04760c343ed255b9385044745fe760bb6889b8881f78f771e1afa5efc8c1c67a42
+  data.tar.gz: 880bee08cf94da0969b4ee738b051a22219b94f0c8facc7bdc34ed9444a54156eb0cf033eb623c2988655229ef2f98b66771eb6bc0d98139f3844f1bf0335c8f

data/README.md CHANGED

@@ -29,13 +29,18 @@ require 'gaspar'
 # This requires that the pdf-table-extract command is present in your PATH.
 Gaspar.parse('document.pdf', 'document.html', {
-  page: 1, format: 'table_html'
+  format: 'table_html'
 })
 # Available options:
 # page - page to parse
 # format -  the type of output: [cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list]
+content = Gaspar.parse_with_content('document.pdf', 'document.html', {
+  format: 'table_html'
+})
+# you can get parsed content
 ```
 Inspired by [Kristin](https://github.com/ricn/kristin)

data/gaspar-pdf.gemspec CHANGED

@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
   spec.license       = 'MIT'
   spec.files         = `git ls-files`.split($RS)
   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ['lib']

data/lib/gaspar.rb CHANGED

@@ -2,6 +2,7 @@ require 'gaspar/version'
 require 'open-uri'
 require 'net/http'
 require 'spoon'
+require 'pdf-reader'
 # Gaspar Gem
 module Gaspar
@@ -12,34 +13,92 @@ module Gaspar
       @source = source
       @target = target
       @options = options
+      @extractor = extractor
     end
     def parse
+      @extractor.extract
+    end
+    def parse_with_content
+      @extractor.extract
+      @extractor.content
+    end
+    private
+    def extractor
+      src = determine_source(@source)
+      pdf = Reader.new(src)
+      Extractor.new(
+        src, @target, pdf.page_count, @options
+      )
+    end
+    def random_source_name
+      rand(16**16).to_s(16)
+    end
+    def download_file(source)
+      tmp_file = "/tmp/#{random_source_name}.pdf"
+      File.open(tmp_file, 'wb') do |saved_file|
+        open(URI.encode(source), 'rb') do |read_file|
+          saved_file.write(read_file.read)
+        end
+      end
+      tmp_file
+    end
+    def determine_source(source)
+      is_file = File.exist?(source) && !File.directory?(source)
+      is_http = URI(source).scheme == 'http'
+      is_https = URI(source).scheme == 'https'
+      unless is_file || is_http || is_https
+        raise IOError, "Source (#{source}) is neither a file nor an URL."
+      end
+      is_file ? source : download_file(source)
+    end
+  end
+  # Extract data from all pages of PDF
+  class Extractor
+    def initialize(source, target, pages, options)
+      @source = source
+      @target = target
+      @pages = pages
+      @options = options
+    end
+    def extract
       unless command_available?
         io_error 'Can\'t find pdf-table-extract executable in PATH'
       end
-      src = determine_source(@source)
-      opts = process_options(src).split(' ')
+      opts = process_options.split(' ')
       args = [extract_command, opts].flatten
       pid = Spoon.spawnp(*args)
       Process.waitpid(pid)
+      io_error("Could not parse #{@source}") unless $?.exitstatus.zero?
+    end
-      io_error("Could not parse #{src}") unless $?.exitstatus.zero?
+    def content
+      open(@target, 'rb').read
     end
     private
-    def io_error(error_message)
-      raise IOError, error_message
-    end
-    def process_options(source)
+    def process_options
       opts = []
-      opts.push("-i #{source}") if source
+      opts.push("-i #{@source}") if @source
       opts.push("-o #{@target}") if @target
-      opts.push("-p #{@options[:page]}") if @options[:page]
+      @pages.times do |p|
+        opts.push("-p #{p + 1}")
+      end
       opts.push("-t #{@options[:format]}") if @options[:format]
       opts.join(' ')
@@ -64,38 +123,15 @@ module Gaspar
       nil
     end
-    def random_source_name
-      rand(16**16).to_s(16)
-    end
-    def download_file(source)
-      tmp_file = "/tmp/#{random_source_name}.pdf"
-      File.open(tmp_file, 'wb') do |saved_file|
-        open(URI.encode(source), 'rb') do |read_file|
-          saved_file.write(read_file.read)
-        end
-      end
-      tmp_file
-    end
-    def determine_source(source)
-      is_file = File.exist?(source) && !File.directory?(source)
-      is_http = URI(source).scheme == 'http'
-      is_https = URI(source).scheme == 'https'
-      unless is_file || is_http || is_https
-        raise IOError, "Source (#{source}) is neither a file nor an URL."
-      end
-      is_file ? source : download_file(source)
+    def io_error(error_message)
+      raise IOError, error_message
     end
   end
   # Read infor from PDF file usin pdf-reader
   class Reader
     def initialize(source)
-      @reader = PDF::Reader.new(source)
+      @reader = ::PDF::Reader.new(source)
     end
     def metadata
@@ -111,9 +147,11 @@ module Gaspar
     end
   end
-  # options[:type]
-  # {cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list}
   def self.parse(source, target, options = {})
     Parser.new(source, target, options).parse
   end
+  def self.parse_with_content(source, target, options = {})
+    Parser.new(source, target, options).parse_with_content
+  end
 end

data/lib/gaspar/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Gaspar
-  VERSION = '0.0.1'
+  VERSION = '0.0.2'
 end

data/spec/gaspar_spec.rb CHANGED

@@ -2,35 +2,42 @@ require 'spec_helper'
 describe Gaspar do
   before(:all) do
-    @table_pdf = file_path("table.pdf")
-    @target_path = "#{Dir::tmpdir}"
-    @table_html = @target_path + "/output.html"
+    @table_pdf = file_path('table.pdf')
+    @target_path = Dir.tmpdir.to_s
+    @table_html = @target_path + '/output.html'
   end
   describe '#parse' do
     describe 'with wrong params' do
       it 'should raise error if source file does not exists' do
-        c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
-        expect { c.parse }.to raise_error(IOError)
+        expect do
+          c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
+          c.parse
+        end.to raise_error(IOError)
       end
       it 'should raise error if source is not file nor url' do
-        c = Gaspar::Parser.new('http://  /.pdf', 'unknown.html')
-        expect { c.parse }.to raise_error(URI::InvalidURIError)
+        expect do
+          c = Gaspar::Parser.new('http://  /.pdf', 'unknown.html')
+          c.parse
+        end.to raise_error(URI::InvalidURIError)
       end
     end
     describe 'with write params' do
-      it "should be possible to specify one page" do
-        Gaspar::Parser.new(@table_pdf, @table_html, {
-          page: 2, format: 'table_html'
-        }).parse
+      it 'should be parsed' do
+        Gaspar::Parser.new(@table_pdf, @table_html,
+                           format: 'table_html').parse
         doc = Nokogiri::HTML(File.open(@table_html))
         expect(doc.search('//comment()').text).not_to be_empty
       end
-    end
+      it 'should get content' do
+        content = Gaspar::Parser.new(@table_pdf, @table_html,
+                                     format: 'table_html').parse_with_content
+        expect(content).not_to be_empty
+      end
+    end
   end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: gaspar-pdf
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - 5rabbits