gaspar-pdf 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 61dc07bbed984fb395fe3e6de603f862906c853a
4
+ data.tar.gz: b1130b45efc157db8a9c0ccc6a6f768d71b6e159
5
+ SHA512:
6
+ metadata.gz: 1aa7157bf22ca40cb902fa894a17ac329e52d002b6387fe9fa0850537ab205b4063ad3e6e77a919e1d4e10461c8dc02eabdee733811a4e4481930be54a6fd989
7
+ data.tar.gz: b4aeb3d4ade931275b57d2924ae1639257634d98460bdfffe85c34a3e7b921d9e4dd7d18fb0f61cd8d17f0540de13875bb097ae582566ce2c49a53459ffee998
data/.gitignore ADDED
@@ -0,0 +1,51 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ # Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
51
+ .DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,57 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ gaspar-pdf (0.0.1)
5
+ pdf-reader
6
+ spoon
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ Ascii85 (1.0.2)
12
+ afm (0.2.2)
13
+ diff-lcs (1.2.5)
14
+ ffi (1.9.14)
15
+ hashery (2.1.2)
16
+ mini_portile2 (2.1.0)
17
+ nokogiri (1.6.8)
18
+ mini_portile2 (~> 2.1.0)
19
+ pkg-config (~> 1.1.7)
20
+ pdf-reader (1.4.0)
21
+ Ascii85 (~> 1.0.0)
22
+ afm (~> 0.2.1)
23
+ hashery (~> 2.0)
24
+ ruby-rc4
25
+ ttfunk
26
+ pkg-config (1.1.7)
27
+ rake (11.2.2)
28
+ rspec (3.5.0)
29
+ rspec-core (~> 3.5.0)
30
+ rspec-expectations (~> 3.5.0)
31
+ rspec-mocks (~> 3.5.0)
32
+ rspec-core (3.5.2)
33
+ rspec-support (~> 3.5.0)
34
+ rspec-expectations (3.5.0)
35
+ diff-lcs (>= 1.2.0, < 2.0)
36
+ rspec-support (~> 3.5.0)
37
+ rspec-mocks (3.5.0)
38
+ diff-lcs (>= 1.2.0, < 2.0)
39
+ rspec-support (~> 3.5.0)
40
+ rspec-support (3.5.0)
41
+ ruby-rc4 (0.1.5)
42
+ spoon (0.0.4)
43
+ ffi
44
+ ttfunk (1.4.0)
45
+
46
+ PLATFORMS
47
+ ruby
48
+
49
+ DEPENDENCIES
50
+ bundler
51
+ gaspar-pdf!
52
+ nokogiri
53
+ rake
54
+ rspec
55
+
56
+ BUNDLED WITH
57
+ 1.12.5
data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2016 5rabbits
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ the Software, and to permit persons to whom the Software is furnished to do so,
8
+ subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
16
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
17
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
19
+ DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,49 @@
1
+ # Gaspar PDF
2
+
3
+ Parses PDF tables into HTML / JSON / XML / CSV files without losing data. This gem uses [pdf-table-extract](https://github.com/ashima/pdf-table-extract).
4
+
5
+ ![Hay PDF, Hay Tabla](https://cloud.githubusercontent.com/assets/445798/17439517/82155610-5af6-11e6-9a3e-cfb0a019b1a1.jpg)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'gaspar-pdf'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install gaspar-pdf
20
+
21
+ ## Usage
22
+
23
+ You need to install [pdf-table-extract](https://github.com/5rabbits/pdf-table-extract/releases) on your system to use this gem.
24
+
25
+ ```ruby
26
+ require 'gaspar'
27
+
28
+ # Parse document.pdf to document.html
29
+ # This requires that the pdf-table-extract command is present in your PATH.
30
+
31
+ Gaspar.parse('document.pdf', 'document.html', {
32
+ page: 1, format: 'table_html'
33
+ })
34
+
35
+ # Available options:
36
+ # page - page to parse
37
+ # format - the type of output: [cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list]
38
+
39
+ ```
40
+
41
+ Inspired by [Kristin](https://github.com/ricn/kristin)
42
+
43
+ ## Contributing
44
+
45
+ 1. Fork it
46
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
47
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
48
+ 4. Push to the branch (`git push origin my-new-feature`)
49
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gaspar/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'gaspar-pdf'
8
+ spec.version = Gaspar::VERSION
9
+ spec.authors = ['5rabbits', 'Abraham Barrera']
10
+ spec.email = ['abarrerac@gmail.com']
11
+ spec.description = 'Parses PDF tables into HTML, JSON, XML and more.'
12
+ spec.summary = 'Parses PDF tables into HTML, JSON, XML and more.'
13
+ spec.homepage = 'https://github.com/5rabbits/gaspar-pdf'
14
+ spec.license = 'MIT'
15
+
16
+ spec.files = `git ls-files`.split($RS)
17
+
18
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
19
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
20
+ spec.require_paths = ['lib']
21
+
22
+ spec.add_dependency 'spoon'
23
+ spec.add_dependency 'pdf-reader'
24
+
25
+ spec.add_development_dependency 'bundler'
26
+ spec.add_development_dependency 'rake'
27
+ spec.add_development_dependency 'nokogiri'
28
+ spec.add_development_dependency 'rspec'
29
+ end
@@ -0,0 +1,3 @@
1
+ module Gaspar
2
+ VERSION = '0.0.1'
3
+ end
data/lib/gaspar.rb ADDED
@@ -0,0 +1,119 @@
1
+ require 'gaspar/version'
2
+ require 'open-uri'
3
+ require 'net/http'
4
+ require 'spoon'
5
+
6
+ # Gaspar Gem
7
+ module Gaspar
8
+ # Parser
9
+ # This class parses a PDF into a machine-readable format
10
+ class Parser
11
+ def initialize(source, target, options = {})
12
+ @source = source
13
+ @target = target
14
+ @options = options
15
+ end
16
+
17
+ def parse
18
+ unless command_available?
19
+ io_error 'Can\'t find pdf-table-extract executable in PATH'
20
+ end
21
+
22
+ src = determine_source(@source)
23
+ opts = process_options(src).split(' ')
24
+ args = [extract_command, opts].flatten
25
+
26
+ pid = Spoon.spawnp(*args)
27
+ Process.waitpid(pid)
28
+
29
+ io_error("Could not parse #{src}") unless $?.exitstatus.zero?
30
+ end
31
+
32
+ private
33
+
34
+ def io_error(error_message)
35
+ raise IOError, error_message
36
+ end
37
+
38
+ def process_options(source)
39
+ opts = []
40
+ opts.push("-i #{source}") if source
41
+ opts.push("-o #{@target}") if @target
42
+ opts.push("-p #{@options[:page]}") if @options[:page]
43
+ opts.push("-t #{@options[:format]}") if @options[:format]
44
+
45
+ opts.join(' ')
46
+ end
47
+
48
+ def command_available?
49
+ extract_command
50
+ end
51
+
52
+ def extract_command
53
+ 'pdf-table-extract' if which('pdf-table-extract')
54
+ end
55
+
56
+ def which(cmd)
57
+ exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
58
+ ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
59
+ exts.each do |ext|
60
+ exe = File.join(path, "#{cmd}#{ext}")
61
+ return exe if File.executable? exe
62
+ end
63
+ end
64
+ nil
65
+ end
66
+
67
+ def random_source_name
68
+ rand(16**16).to_s(16)
69
+ end
70
+
71
+ def download_file(source)
72
+ tmp_file = "/tmp/#{random_source_name}.pdf"
73
+ File.open(tmp_file, 'wb') do |saved_file|
74
+ open(URI.encode(source), 'rb') do |read_file|
75
+ saved_file.write(read_file.read)
76
+ end
77
+ end
78
+
79
+ tmp_file
80
+ end
81
+
82
+ def determine_source(source)
83
+ is_file = File.exist?(source) && !File.directory?(source)
84
+ is_http = URI(source).scheme == 'http'
85
+ is_https = URI(source).scheme == 'https'
86
+
87
+ unless is_file || is_http || is_https
88
+ raise IOError, "Source (#{source}) is neither a file nor an URL."
89
+ end
90
+
91
+ is_file ? source : download_file(source)
92
+ end
93
+ end
94
+
95
+ # Read infor from PDF file usin pdf-reader
96
+ class Reader
97
+ def initialize(source)
98
+ @reader = PDF::Reader.new(source)
99
+ end
100
+
101
+ def metadata
102
+ @reader.metadata
103
+ end
104
+
105
+ def info
106
+ @reader.info
107
+ end
108
+
109
+ def page_count
110
+ @reader.page_count
111
+ end
112
+ end
113
+
114
+   # Parses source into target. options: :page and :format, where :format is one of
115
+   # {cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list}
116
+   def self.parse(source, target, options = {})
117
+     Parser.new(source, target, options).parse
118
+   end
119
+ end
Binary file
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
+ describe Gaspar do
4
+ before(:all) do
5
+ @table_pdf = file_path("table.pdf")
6
+ @target_path = "#{Dir::tmpdir}"
7
+ @table_html = @target_path + "/output.html"
8
+ end
9
+
10
+
11
+ describe '#parse' do
12
+ describe 'with wrong params' do
13
+ it 'should raise error if source file does not exists' do
14
+ c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
15
+ expect { c.parse }.to raise_error(IOError)
16
+ end
17
+
18
+ it 'should raise error if source is not file nor url' do
19
+ c = Gaspar::Parser.new('http:// /.pdf', 'unknown.html')
20
+ expect { c.parse }.to raise_error(URI::InvalidURIError)
21
+ end
22
+ end
23
+
24
+ describe 'with write params' do
25
+ it "should be possible to specify one page" do
26
+ Gaspar::Parser.new(@table_pdf, @table_html, {
27
+ page: 2, format: 'table_html'
28
+ }).parse
29
+
30
+ doc = Nokogiri::HTML(File.open(@table_html))
31
+ expect(doc.search('//comment()').text).not_to be_empty
32
+ end
33
+ end
34
+
35
+ end
36
+ end
@@ -0,0 +1,13 @@
1
+ require 'gaspar'
2
+ require 'nokogiri'
3
+ require 'tmpdir'
4
+
5
+ def file_path(*paths)
6
+ File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
7
+ end
8
+
9
+ RSpec.configure do |config|
10
+ config.run_all_when_everything_filtered = true
11
+ config.filter_run :focus
12
+ config.order = 'random'
13
+ end
metadata ADDED
@@ -0,0 +1,145 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gaspar-pdf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - 5rabbits
8
+ - Abraham Barrera
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2016-08-05 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: spoon
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: pdf-reader
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: bundler
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rake
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: nokogiri
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: rspec
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ description: Parses PDF tables into HTML, JSON, XML and more.
99
+ email:
100
+ - abarrerac@gmail.com
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - Gemfile
108
+ - Gemfile.lock
109
+ - LICENSE.txt
110
+ - README.md
111
+ - Rakefile
112
+ - gaspar-pdf.gemspec
113
+ - lib/gaspar.rb
114
+ - lib/gaspar/version.rb
115
+ - spec/fixtures/table.pdf
116
+ - spec/gaspar_spec.rb
117
+ - spec/spec_helper.rb
118
+ homepage: https://github.com/5rabbits/gaspar-pdf
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.4.5
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Parses PDF tables into HTML, JSON, XML and more.
142
+ test_files:
143
+ - spec/fixtures/table.pdf
144
+ - spec/gaspar_spec.rb
145
+ - spec/spec_helper.rb