gaspar-pdf 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -1
- data/gaspar-pdf.gemspec +1 -1
- data/lib/gaspar.rb +76 -38
- data/lib/gaspar/version.rb +1 -1
- data/spec/gaspar_spec.rb +20 -13
- metadata +1 -1
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 5d61897dcb86359510caf89940b8b4273f8df92d
         | 
| 4 | 
            +
              data.tar.gz: 44482e7a3dd4bb9c53995ad16e49ae9a01142bc0
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: a50d871c11e83bb6a696bee1d75df7be959dd2979337d408d3e84c70aa303a04760c343ed255b9385044745fe760bb6889b8881f78f771e1afa5efc8c1c67a42
         | 
| 7 | 
            +
              data.tar.gz: 880bee08cf94da0969b4ee738b051a22219b94f0c8facc7bdc34ed9444a54156eb0cf033eb623c2988655229ef2f98b66771eb6bc0d98139f3844f1bf0335c8f
         | 
    
        data/README.md
    CHANGED
    
    | @@ -29,13 +29,18 @@ require 'gaspar' | |
| 29 29 | 
             
            # This requires that the pdf-table-extract command is present in your PATH.
         | 
| 30 30 |  | 
| 31 31 | 
             
            Gaspar.parse('document.pdf', 'document.html', {
         | 
| 32 | 
            -
               | 
| 32 | 
            +
              format: 'table_html'
         | 
| 33 33 | 
             
            })
         | 
| 34 34 |  | 
| 35 35 | 
             
            # Available options:
         | 
| 36 36 | 
             
            # page - page to parse
         | 
| 37 37 | 
             
            # format -  the type of output: [cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list]
         | 
| 38 38 |  | 
| 39 | 
            +
            content = Gaspar.parse_with_content('document.pdf', 'document.html', {
         | 
| 40 | 
            +
              format: 'table_html'
         | 
| 41 | 
            +
            })
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            # content now holds the parsed output that was written to document.html
         | 
| 39 44 | 
             
            ```
         | 
| 40 45 |  | 
| 41 46 | 
             
            Inspired by [Kristin](https://github.com/ricn/kristin)
         | 
    
        data/gaspar-pdf.gemspec
    CHANGED
    
    | @@ -14,7 +14,7 @@ Gem::Specification.new do |spec| | |
| 14 14 | 
             
              spec.license       = 'MIT'
         | 
| 15 15 |  | 
| 16 16 | 
             
              spec.files         = `git ls-files`.split($RS)
         | 
| 17 | 
            -
             | 
| 17 | 
            +
             | 
| 18 18 | 
             
              spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
         | 
| 19 19 | 
             
              spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
         | 
| 20 20 | 
             
              spec.require_paths = ['lib']
         | 
    
        data/lib/gaspar.rb
    CHANGED
    
    | @@ -2,6 +2,7 @@ require 'gaspar/version' | |
| 2 2 | 
             
            require 'open-uri'
         | 
| 3 3 | 
             
            require 'net/http'
         | 
| 4 4 | 
             
            require 'spoon'
         | 
| 5 | 
            +
            require 'pdf-reader'
         | 
| 5 6 |  | 
| 6 7 | 
             
            # Gaspar Gem
         | 
| 7 8 | 
             
            module Gaspar
         | 
| @@ -12,34 +13,92 @@ module Gaspar | |
| 12 13 | 
             
                  @source = source
         | 
| 13 14 | 
             
                  @target = target
         | 
| 14 15 | 
             
                  @options = options
         | 
| 16 | 
            +
                  @extractor = extractor
         | 
| 15 17 | 
             
                end
         | 
| 16 18 |  | 
| 17 19 | 
             
                def parse
         | 
| 20 | 
            +
                  @extractor.extract
         | 
| 21 | 
            +
                end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                def parse_with_content
         | 
| 24 | 
            +
                  @extractor.extract
         | 
| 25 | 
            +
                  @extractor.content
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                private
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                def extractor
         | 
| 31 | 
            +
                  src = determine_source(@source)
         | 
| 32 | 
            +
                  pdf = Reader.new(src)
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                  Extractor.new(
         | 
| 35 | 
            +
                    src, @target, pdf.page_count, @options
         | 
| 36 | 
            +
                  )
         | 
| 37 | 
            +
                end
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                def random_source_name
         | 
| 40 | 
            +
                  rand(16**16).to_s(16)
         | 
| 41 | 
            +
                end
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                def download_file(source)
         | 
| 44 | 
            +
                  tmp_file = "/tmp/#{random_source_name}.pdf"
         | 
| 45 | 
            +
                  File.open(tmp_file, 'wb') do |saved_file|
         | 
| 46 | 
            +
                    open(URI.encode(source), 'rb') do |read_file|
         | 
| 47 | 
            +
                      saved_file.write(read_file.read)
         | 
| 48 | 
            +
                    end
         | 
| 49 | 
            +
                  end
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                  tmp_file
         | 
| 52 | 
            +
                end
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                def determine_source(source)
         | 
| 55 | 
            +
                  is_file = File.exist?(source) && !File.directory?(source)
         | 
| 56 | 
            +
                  is_http = URI(source).scheme == 'http'
         | 
| 57 | 
            +
                  is_https = URI(source).scheme == 'https'
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                  unless is_file || is_http || is_https
         | 
| 60 | 
            +
                    raise IOError, "Source (#{source}) is neither a file nor an URL."
         | 
| 61 | 
            +
                  end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                  is_file ? source : download_file(source)
         | 
| 64 | 
            +
                end
         | 
| 65 | 
            +
              end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
              # Extract data from all pages of PDF
         | 
| 68 | 
            +
              class Extractor
         | 
| 69 | 
            +
                def initialize(source, target, pages, options)
         | 
| 70 | 
            +
                  @source = source
         | 
| 71 | 
            +
                  @target = target
         | 
| 72 | 
            +
                  @pages = pages
         | 
| 73 | 
            +
                  @options = options
         | 
| 74 | 
            +
                end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                def extract
         | 
| 18 77 | 
             
                  unless command_available?
         | 
| 19 78 | 
             
                    io_error 'Can\'t find pdf-table-extract executable in PATH'
         | 
| 20 79 | 
             
                  end
         | 
| 21 80 |  | 
| 22 | 
            -
                   | 
| 23 | 
            -
                  opts = process_options(src).split(' ')
         | 
| 81 | 
            +
                  opts = process_options.split(' ')
         | 
| 24 82 | 
             
                  args = [extract_command, opts].flatten
         | 
| 25 83 |  | 
| 26 84 | 
             
                  pid = Spoon.spawnp(*args)
         | 
| 27 85 | 
             
                  Process.waitpid(pid)
         | 
| 86 | 
            +
                  io_error("Could not parse #{@source}") unless $?.exitstatus.zero?
         | 
| 87 | 
            +
                end
         | 
| 28 88 |  | 
| 29 | 
            -
             | 
| 89 | 
            +
                def content
         | 
| 90 | 
            +
                  open(@target, 'rb').read
         | 
| 30 91 | 
             
                end
         | 
| 31 92 |  | 
| 32 93 | 
             
                private
         | 
| 33 94 |  | 
| 34 | 
            -
                def  | 
| 35 | 
            -
                  raise IOError, error_message
         | 
| 36 | 
            -
                end
         | 
| 37 | 
            -
             | 
| 38 | 
            -
                def process_options(source)
         | 
| 95 | 
            +
                def process_options
         | 
| 39 96 | 
             
                  opts = []
         | 
| 40 | 
            -
                  opts.push("-i #{source}") if source
         | 
| 97 | 
            +
                  opts.push("-i #{@source}") if @source
         | 
| 41 98 | 
             
                  opts.push("-o #{@target}") if @target
         | 
| 42 | 
            -
                   | 
| 99 | 
            +
                  @pages.times do |p|
         | 
| 100 | 
            +
                    opts.push("-p #{p + 1}")
         | 
| 101 | 
            +
                  end
         | 
| 43 102 | 
             
                  opts.push("-t #{@options[:format]}") if @options[:format]
         | 
| 44 103 |  | 
| 45 104 | 
             
                  opts.join(' ')
         | 
| @@ -64,38 +123,15 @@ module Gaspar | |
| 64 123 | 
             
                  nil
         | 
| 65 124 | 
             
                end
         | 
| 66 125 |  | 
| 67 | 
            -
                def  | 
| 68 | 
            -
                   | 
| 69 | 
            -
                end
         | 
| 70 | 
            -
             | 
| 71 | 
            -
                def download_file(source)
         | 
| 72 | 
            -
                  tmp_file = "/tmp/#{random_source_name}.pdf"
         | 
| 73 | 
            -
                  File.open(tmp_file, 'wb') do |saved_file|
         | 
| 74 | 
            -
                    open(URI.encode(source), 'rb') do |read_file|
         | 
| 75 | 
            -
                      saved_file.write(read_file.read)
         | 
| 76 | 
            -
                    end
         | 
| 77 | 
            -
                  end
         | 
| 78 | 
            -
             | 
| 79 | 
            -
                  tmp_file
         | 
| 80 | 
            -
                end
         | 
| 81 | 
            -
             | 
| 82 | 
            -
                def determine_source(source)
         | 
| 83 | 
            -
                  is_file = File.exist?(source) && !File.directory?(source)
         | 
| 84 | 
            -
                  is_http = URI(source).scheme == 'http'
         | 
| 85 | 
            -
                  is_https = URI(source).scheme == 'https'
         | 
| 86 | 
            -
             | 
| 87 | 
            -
                  unless is_file || is_http || is_https
         | 
| 88 | 
            -
                    raise IOError, "Source (#{source}) is neither a file nor an URL."
         | 
| 89 | 
            -
                  end
         | 
| 90 | 
            -
             | 
| 91 | 
            -
                  is_file ? source : download_file(source)
         | 
| 126 | 
            +
                def io_error(error_message)
         | 
| 127 | 
            +
                  raise IOError, error_message
         | 
| 92 128 | 
             
                end
         | 
| 93 129 | 
             
              end
         | 
| 94 130 |  | 
| 95 131 | 
             
              # Read info from a PDF file using pdf-reader
         | 
| 96 132 | 
             
              class Reader
         | 
| 97 133 | 
             
                def initialize(source)
         | 
| 98 | 
            -
                  @reader = PDF::Reader.new(source)
         | 
| 134 | 
            +
                  @reader = ::PDF::Reader.new(source)
         | 
| 99 135 | 
             
                end
         | 
| 100 136 |  | 
| 101 137 | 
             
                def metadata
         | 
| @@ -111,9 +147,11 @@ module Gaspar | |
| 111 147 | 
             
                end
         | 
| 112 148 | 
             
              end
         | 
| 113 149 |  | 
| 114 | 
            -
              # options[:type]
         | 
| 115 | 
            -
              # {cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list}
         | 
| 116 150 | 
             
              def self.parse(source, target, options = {})
         | 
| 117 151 | 
             
                Parser.new(source, target, options).parse
         | 
| 118 152 | 
             
              end
         | 
| 153 | 
            +
             | 
| 154 | 
            +
              def self.parse_with_content(source, target, options = {})
         | 
| 155 | 
            +
                Parser.new(source, target, options).parse_with_content
         | 
| 156 | 
            +
              end
         | 
| 119 157 | 
             
            end
         | 
    
        data/lib/gaspar/version.rb
    CHANGED
    
    
    
        data/spec/gaspar_spec.rb
    CHANGED
    
    | @@ -2,35 +2,42 @@ require 'spec_helper' | |
| 2 2 |  | 
| 3 3 | 
             
            describe Gaspar do
         | 
| 4 4 | 
             
              before(:all) do
         | 
| 5 | 
            -
                @table_pdf = file_path( | 
| 6 | 
            -
                @target_path =  | 
| 7 | 
            -
                @table_html = @target_path +  | 
| 5 | 
            +
                @table_pdf = file_path('table.pdf')
         | 
| 6 | 
            +
                @target_path = Dir.tmpdir.to_s
         | 
| 7 | 
            +
                @table_html = @target_path + '/output.html'
         | 
| 8 8 | 
             
              end
         | 
| 9 9 |  | 
| 10 | 
            -
             | 
| 11 10 | 
             
              describe '#parse' do
         | 
| 12 11 | 
             
                describe 'with wrong params' do
         | 
| 13 12 | 
             
                  it 'should raise error if source file does not exists' do
         | 
| 14 | 
            -
                     | 
| 15 | 
            -
             | 
| 13 | 
            +
                    expect do
         | 
| 14 | 
            +
                      c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
         | 
| 15 | 
            +
                      c.parse
         | 
| 16 | 
            +
                    end.to raise_error(IOError)
         | 
| 16 17 | 
             
                  end
         | 
| 17 18 |  | 
| 18 19 | 
             
                  it 'should raise error if source is not file nor url' do
         | 
| 19 | 
            -
                     | 
| 20 | 
            -
             | 
| 20 | 
            +
                    expect do
         | 
| 21 | 
            +
                      c = Gaspar::Parser.new('http://  /.pdf', 'unknown.html')
         | 
| 22 | 
            +
                      c.parse
         | 
| 23 | 
            +
                    end.to raise_error(URI::InvalidURIError)
         | 
| 21 24 | 
             
                  end
         | 
| 22 25 | 
             
                end
         | 
| 23 26 |  | 
| 24 27 | 
             
                describe 'with write params' do
         | 
| 25 | 
            -
                  it  | 
| 26 | 
            -
                    Gaspar::Parser.new(@table_pdf, @table_html, | 
| 27 | 
            -
             | 
| 28 | 
            -
                    }).parse
         | 
| 28 | 
            +
                  it 'should be parsed' do
         | 
| 29 | 
            +
                    Gaspar::Parser.new(@table_pdf, @table_html,
         | 
| 30 | 
            +
                                       format: 'table_html').parse
         | 
| 29 31 |  | 
| 30 32 | 
             
                    doc = Nokogiri::HTML(File.open(@table_html))
         | 
| 31 33 | 
             
                    expect(doc.search('//comment()').text).not_to be_empty
         | 
| 32 34 | 
             
                  end
         | 
| 33 | 
            -
                end
         | 
| 34 35 |  | 
| 36 | 
            +
                  it 'should get content' do
         | 
| 37 | 
            +
                    content = Gaspar::Parser.new(@table_pdf, @table_html,
         | 
| 38 | 
            +
                                                 format: 'table_html').parse_with_content
         | 
| 39 | 
            +
                    expect(content).not_to be_empty
         | 
| 40 | 
            +
                  end
         | 
| 41 | 
            +
                end
         | 
| 35 42 | 
             
              end
         | 
| 36 43 | 
             
            end
         |