gaspar-pdf 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 61dc07bbed984fb395fe3e6de603f862906c853a
4
+ data.tar.gz: b1130b45efc157db8a9c0ccc6a6f768d71b6e159
5
+ SHA512:
6
+ metadata.gz: 1aa7157bf22ca40cb902fa894a17ac329e52d002b6387fe9fa0850537ab205b4063ad3e6e77a919e1d4e10461c8dc02eabdee733811a4e4481930be54a6fd989
7
+ data.tar.gz: b4aeb3d4ade931275b57d2924ae1639257634d98460bdfffe85c34a3e7b921d9e4dd7d18fb0f61cd8d17f0540de13875bb097ae582566ce2c49a53459ffee998
data/.gitignore ADDED
@@ -0,0 +1,51 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ # Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
51
+ .DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,57 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ gaspar-pdf (0.0.1)
5
+ pdf-reader
6
+ spoon
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ Ascii85 (1.0.2)
12
+ afm (0.2.2)
13
+ diff-lcs (1.2.5)
14
+ ffi (1.9.14)
15
+ hashery (2.1.2)
16
+ mini_portile2 (2.1.0)
17
+ nokogiri (1.6.8)
18
+ mini_portile2 (~> 2.1.0)
19
+ pkg-config (~> 1.1.7)
20
+ pdf-reader (1.4.0)
21
+ Ascii85 (~> 1.0.0)
22
+ afm (~> 0.2.1)
23
+ hashery (~> 2.0)
24
+ ruby-rc4
25
+ ttfunk
26
+ pkg-config (1.1.7)
27
+ rake (11.2.2)
28
+ rspec (3.5.0)
29
+ rspec-core (~> 3.5.0)
30
+ rspec-expectations (~> 3.5.0)
31
+ rspec-mocks (~> 3.5.0)
32
+ rspec-core (3.5.2)
33
+ rspec-support (~> 3.5.0)
34
+ rspec-expectations (3.5.0)
35
+ diff-lcs (>= 1.2.0, < 2.0)
36
+ rspec-support (~> 3.5.0)
37
+ rspec-mocks (3.5.0)
38
+ diff-lcs (>= 1.2.0, < 2.0)
39
+ rspec-support (~> 3.5.0)
40
+ rspec-support (3.5.0)
41
+ ruby-rc4 (0.1.5)
42
+ spoon (0.0.4)
43
+ ffi
44
+ ttfunk (1.4.0)
45
+
46
+ PLATFORMS
47
+ ruby
48
+
49
+ DEPENDENCIES
50
+ bundler
51
+ gaspar-pdf!
52
+ nokogiri
53
+ rake
54
+ rspec
55
+
56
+ BUNDLED WITH
57
+ 1.12.5
data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2016 5rabbits
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ the Software, and to permit persons to whom the Software is furnished to do so,
8
+ subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
16
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
17
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
19
+ DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,49 @@
1
+ # Gaspar PDF
2
+
3
+ Parses PDF tables into HTML / JSON / XML / CSV files without losing data. This gem uses [pdf-table-extract](https://github.com/ashima/pdf-table-extract).
4
+
5
+ ![Hay PDF, Hay Tabla](https://cloud.githubusercontent.com/assets/445798/17439517/82155610-5af6-11e6-9a3e-cfb0a019b1a1.jpg)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'gaspar-pdf'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install gaspar-pdf
20
+
21
+ ## Usage
22
+
23
+ You need to install [pdf-table-extract](https://github.com/5rabbits/pdf-table-extract/releases) on your system to use this gem.
24
+
25
+ ```ruby
26
+ require 'gaspar'
27
+
28
+ # Parse document.pdf to document.html
29
+ # This requires that the pdf-table-extract command is present in your PATH.
30
+
31
+ Gaspar.parse('document.pdf', 'document.html', {
32
+ page: 1, format: 'table_html'
33
+ })
34
+
35
+ # Available options:
36
+ # page - page to parse
37
+ # format - the type of output: [cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list]
38
+
39
+ ```
40
+
41
+ Inspired by [Kristin](https://github.com/ricn/kristin)
42
+
43
+ ## Contributing
44
+
45
+ 1. Fork it
46
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
47
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
48
+ 4. Push to the branch (`git push origin my-new-feature`)
49
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
# Define the `spec` Rake task through RSpec's Rake integration.
RSpec::Core::RakeTask.new(:spec)

# Make `spec` the prerequisite of the default task, so a bare `rake` runs it.
task :default => :spec
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gaspar/version'
5
+
6
# Gem specification for gaspar-pdf.
Gem::Specification.new do |spec|
  spec.name          = 'gaspar-pdf'
  spec.version       = Gaspar::VERSION
  spec.authors       = ['5rabbits', 'Abraham Barrera']
  spec.email         = ['abarrerac@gmail.com']
  spec.description   = 'Parses PDF tables into HTML, JSON, XML and more.'
  spec.summary       = 'Parses PDF tables into HTML, JSON, XML and more.'
  spec.homepage      = 'https://github.com/5rabbits/gaspar-pdf'
  spec.license       = 'MIT'

  # `git ls-files` prints one path per line. Split on an explicit newline:
  # $RS is only defined after `require 'English'`, which this file does not
  # load, so it would be nil and String#split would fall back to splitting on
  # any whitespace (breaking paths that contain spaces).
  spec.files         = `git ls-files`.split("\n")

  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Runtime dependencies.
  spec.add_dependency 'spoon'
  spec.add_dependency 'pdf-reader'

  # Development-only dependencies.
  spec.add_development_dependency 'bundler'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'nokogiri'
  spec.add_development_dependency 'rspec'
end
@@ -0,0 +1,3 @@
1
# Gaspar Gem
module Gaspar
  # Version string of the gem.
  VERSION = '0.0.1'
end
data/lib/gaspar.rb ADDED
@@ -0,0 +1,119 @@
1
+ require 'gaspar/version'
2
+ require 'open-uri'
3
+ require 'net/http'
4
+ require 'spoon'
5
+
6
+ # Gaspar Gem
7
+ module Gaspar
8
+ # Parser
9
+ # This class parses a PDF into a machine-readable format
10
+ class Parser
11
+ def initialize(source, target, options = {})
12
+ @source = source
13
+ @target = target
14
+ @options = options
15
+ end
16
+
17
+ def parse
18
+ unless command_available?
19
+ io_error 'Can\'t find pdf-table-extract executable in PATH'
20
+ end
21
+
22
+ src = determine_source(@source)
23
+ opts = process_options(src).split(' ')
24
+ args = [extract_command, opts].flatten
25
+
26
+ pid = Spoon.spawnp(*args)
27
+ Process.waitpid(pid)
28
+
29
+ io_error("Could not parse #{src}") unless $?.exitstatus.zero?
30
+ end
31
+
32
+ private
33
+
34
+ def io_error(error_message)
35
+ raise IOError, error_message
36
+ end
37
+
38
+ def process_options(source)
39
+ opts = []
40
+ opts.push("-i #{source}") if source
41
+ opts.push("-o #{@target}") if @target
42
+ opts.push("-p #{@options[:page]}") if @options[:page]
43
+ opts.push("-t #{@options[:format]}") if @options[:format]
44
+
45
+ opts.join(' ')
46
+ end
47
+
48
+ def command_available?
49
+ extract_command
50
+ end
51
+
52
+ def extract_command
53
+ 'pdf-table-extract' if which('pdf-table-extract')
54
+ end
55
+
56
+ def which(cmd)
57
+ exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
58
+ ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
59
+ exts.each do |ext|
60
+ exe = File.join(path, "#{cmd}#{ext}")
61
+ return exe if File.executable? exe
62
+ end
63
+ end
64
+ nil
65
+ end
66
+
67
+ def random_source_name
68
+ rand(16**16).to_s(16)
69
+ end
70
+
71
+ def download_file(source)
72
+ tmp_file = "/tmp/#{random_source_name}.pdf"
73
+ File.open(tmp_file, 'wb') do |saved_file|
74
+ open(URI.encode(source), 'rb') do |read_file|
75
+ saved_file.write(read_file.read)
76
+ end
77
+ end
78
+
79
+ tmp_file
80
+ end
81
+
82
+ def determine_source(source)
83
+ is_file = File.exist?(source) && !File.directory?(source)
84
+ is_http = URI(source).scheme == 'http'
85
+ is_https = URI(source).scheme == 'https'
86
+
87
+ unless is_file || is_http || is_https
88
+ raise IOError, "Source (#{source}) is neither a file nor an URL."
89
+ end
90
+
91
+ is_file ? source : download_file(source)
92
+ end
93
+ end
94
+
95
# Reads information from a PDF file using pdf-reader.
class Reader
  # source - handed unchanged to PDF::Reader.new.
  def initialize(source)
    # Nothing else in this file loads pdf-reader, so without this require
    # the PDF::Reader constant is undefined and this constructor raises
    # NameError. The gem is a declared runtime dependency of gaspar-pdf.
    require 'pdf/reader'

    @reader = PDF::Reader.new(source)
  end

  # Returns whatever PDF::Reader#metadata returns for the document.
  def metadata
    @reader.metadata
  end

  # Returns whatever PDF::Reader#info returns for the document.
  def info
    @reader.info
  end

  # Returns whatever PDF::Reader#page_count returns for the document.
  def page_count
    @reader.page_count
  end
end
113
+
114
# Convenience entry point: builds a Parser for the given arguments and runs it.
#
# options[:page]   - forwarded to Parser
# options[:format] - forwarded to Parser, one of
#   {cells_csv,cells_json,cells_xml,table_csv,table_html,table_chtml,table_list}
def self.parse(source, target, options = {})
  parser = Parser.new(source, target, options)
  parser.parse
end
119
+ end
Binary file
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
describe Gaspar do
  before(:all) do
    @table_pdf = file_path("table.pdf")
    @target_path = "#{Dir::tmpdir}"
    @table_html = @target_path + "/output.html"

    # The output lives at a fixed path in the shared temp directory. Remove
    # anything left by an earlier run so the assertions below can only pass
    # on a file written during this run.
    File.delete(@table_html) if File.exist?(@table_html)
  end

  after(:all) do
    # Do not leave the generated file behind.
    File.delete(@table_html) if File.exist?(@table_html)
  end

  describe '#parse' do
    describe 'with wrong params' do
      it 'should raise error if source file does not exists' do
        c = Gaspar::Parser.new('unknown.pdf', 'unknown.html')
        expect { c.parse }.to raise_error(IOError)
      end

      it 'should raise error if source is not file nor url' do
        c = Gaspar::Parser.new('http:// /.pdf', 'unknown.html')
        expect { c.parse }.to raise_error(URI::InvalidURIError)
      end
    end

    describe 'with write params' do
      it "should be possible to specify one page" do
        Gaspar::Parser.new(@table_pdf, @table_html, {
          page: 2, format: 'table_html'
        }).parse

        # File.read closes the file; a bare File.open would leak the handle.
        doc = Nokogiri::HTML(File.read(@table_html))
        expect(doc.search('//comment()').text).not_to be_empty
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'gaspar'
2
+ require 'nokogiri'
3
+ require 'tmpdir'
4
+
5
# Returns the absolute path of a file below the `fixtures` directory that
# sits next to this file. `paths` are the path components under `fixtures`.
def file_path(*paths)
  base_dir = File.dirname(__FILE__)
  relative = File.join('fixtures', *paths)
  File.expand_path(relative, base_dir)
end
8
+
9
# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  # Run the examples in random order.
  rspec.order = 'random'

  # Filter the run on the :focus tag ...
  rspec.filter_run :focus
  # ... and, going by the option name, run everything when the filter leaves
  # nothing to run.
  rspec.run_all_when_everything_filtered = true
end
metadata ADDED
@@ -0,0 +1,145 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gaspar-pdf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - 5rabbits
8
+ - Abraham Barrera
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2016-08-05 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: spoon
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: pdf-reader
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: bundler
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rake
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: nokogiri
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: rspec
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ description: Parses PDF tables into HTML, JSON, XML and more.
99
+ email:
100
+ - abarrerac@gmail.com
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - Gemfile
108
+ - Gemfile.lock
109
+ - LICENSE.txt
110
+ - README.md
111
+ - Rakefile
112
+ - gaspar-pdf.gemspec
113
+ - lib/gaspar.rb
114
+ - lib/gaspar/version.rb
115
+ - spec/fixtures/table.pdf
116
+ - spec/gaspar_spec.rb
117
+ - spec/spec_helper.rb
118
+ homepage: https://github.com/5rabbits/gaspar-pdf
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.4.5
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Parses PDF tables into HTML, JSON, XML and more.
142
+ test_files:
143
+ - spec/fixtures/table.pdf
144
+ - spec/gaspar_spec.rb
145
+ - spec/spec_helper.rb