doc_ripper 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OTJjZmJlZjQwMzQ2ZDFlMWQwMzIyY2UyNGJmYTA1NWQxNGJmODEyZA==
5
+ data.tar.gz: !binary |-
6
+ MWIxZWYzZmIwZWE5Yjk5MTg0N2RkNWVjNTA1ODYxYzg4NWM5NTkzYw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ MDczOTMxNDI1ZWFlYmQzM2JlMTU0YjJlYjEwNGQ5ZjM4M2I0ZmYxMWRhOTNl
10
+ YTkxN2UyY2ZhZWVhYmE3ZWYyNmZlODA3MWJjN2M1ZDI1MjZjMjdhZmQ3ODE0
11
+ NDIxYzA0ZTA4MWFhOTRhNjcxY2U0NmVhMDM3MGZkN2NiOWY0OTg=
12
+ data.tar.gz: !binary |-
13
+ OTA0NmU4ZDliOWY1MjVjYmYzMTJjNjJhZmI4YzUyZWQyYTg1ZDNhMzM0Y2Zm
14
+ MTZhNGI2NDMzMjE0MGVjN2EwMDE2YWRjNjYzNTAxYWVlZmU3ZGNhZjYzNWE3
15
+ Y2M5NzU2ZTliNjIwNDI4MzNlN2ZkNjYxM2I3YTRhMzA3Y2FlNjk=
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in doc_ripper.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Paul Zaich
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,37 @@
1
+ DocRipper is an extremely lightweight Ruby wrapper that can be used to parse text contents from common file formats (currently .doc, .docx and .pdf) without the need for a large number of dependencies like an OCR library or OpenOffice/LibreOffice.
2
+
3
+ For simple parsing, you'll likely see a large performance improvement with DocRipper over solutions that rely on OpenOffice/LibreOffice for .doc/.docx conversion. I found
4
+
5
+ Need OCR support or in-image text parsing? Take a look at [Docsplit](https://github.com/documentcloud/docsplit).
6
+
7
+ ## Quickstart
8
+
9
+ ```
10
+ gem install doc_ripper
11
+ ```
12
+ ### Specify a file to parse
13
+
14
+ ```
15
+ DocRipper::TextRipper.new('/path/to/file')
16
+ ```
17
+
18
+ ### Return the file's text
19
+ ```
20
+ dr = DocRipper::TextRipper.new('/path/to/file')
21
+ dr.text
22
+ => "Document's text"
23
+ ```
24
+
25
+ If the file cannot be read, nil will be returned.
26
+
27
+ ```
28
+ dr = DocRipper::TextRipper.new('/path/to/missing/file')
29
+ dr.text
30
+ => nil
31
+ ```
32
+
33
+
34
+ ## Dependencies
35
+ - Ruby version >= 1.9.2
36
+ - [Poppler-utils/(pdftotext)](http://poppler.freedesktop.org/) (PDF)
37
+ - [Antiword](http://www.winfield.demon.nl/) (docx) more info: http://linux.die.net/man/1/antiword
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'doc_ripper/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "doc_ripper"
8
+ spec.version = DocRipper::VERSION
9
+ spec.authors = ["Paul Zaich"]
10
+ spec.email = ["pzaich@gmail.com"]
11
+ spec.summary = %q{Rip out text from pdf, doc and docx formats}
12
+ spec.description = %q{Provides a lean, convenient ruby wrapper to poppler, and antiword command line tools to quickly rip out text from common text formats.}
13
+ spec.homepage = "https://github.com/pzaich/doc_ripper"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.requirements << 'Antiword'
22
+ spec.requirements << "pdftotext/poppler"
23
+
24
+ spec.add_development_dependency "bundler", "~> 1.6"
25
+ spec.add_development_dependency "rake", "~> 10.0"
26
+ end
@@ -0,0 +1,11 @@
1
+ require 'shellwords'
2
+ require "doc_ripper/version"
3
+ require "doc_ripper/ripper/base"
4
+ require "doc_ripper/text_ripper"
5
+ require "doc_ripper/pdf_ripper"
6
+ require "doc_ripper/docx_ripper"
7
+ require "doc_ripper/ms_doc_ripper"
8
+
9
+ module DocRipper
10
+
11
+ end
@@ -0,0 +1,9 @@
1
+ module DocRipper
2
+ class DocxRipper < Ripper::Base
3
+
4
+ def rip
5
+ @text ||= system(%Q[ unzip -p #{to_shell(@file_path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$' > #{to_shell(@text_file_path)} ])
6
+ end
7
+
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module DocRipper
2
+ class MsDocRipper < Ripper::Base
3
+
4
+ def rip
5
+ @text ||= system(%Q[ antiword #{to_shell(@file_path)} > #{to_shell(@text_file_path)} ])
6
+ end
7
+
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module DocRipper
2
+ class PdfRipper < Ripper::Base
3
+
4
+ def rip
5
+ @text ||= system(%Q[ pdftotext #{to_shell(@file_path)} > #{to_shell(@text_file_path)} ])
6
+ end
7
+
8
+ end
9
+ end
@@ -0,0 +1,20 @@
1
+ module DocRipper
2
+ module Ripper
3
+
4
+ class Base
5
+ attr_reader :text
6
+
7
+ def initialize(file_path)
8
+ @file_path = file_path
9
+ @text_file_path = "#{file_path.split('.').first}.txt"
10
+ end
11
+
12
+ private
13
+
14
+ def to_shell(file_path)
15
+ Shellwords.escape(file_path)
16
+ end
17
+ end
18
+
19
+ end
20
+ end
@@ -0,0 +1,29 @@
1
+ #encoding: UTF-8
2
+
3
+ module DocRipper
4
+ class TextRipper < Ripper::Base
5
+ attr_reader :text_file_path, :file_path
6
+
7
+ def rip
8
+ @is_ripped ||=choose_ripper
9
+ end
10
+
11
+ def text
12
+ @text ||= IO.read(@text_file_path).force_encoding("ISO-8859-1").encode("utf-8", replace: nil) if rip
13
+ end
14
+
15
+ private
16
+
17
+ def choose_ripper
18
+ case
19
+ when !!(@file_path[-5.. -1] =~ /.docx/i)
20
+ DocxRipper.new(@file_path).rip
21
+ when !!(@file_path[-4.. -1] =~ /.doc/i)
22
+ MsDocRipper.new(@file_path).rip
23
+ when !!(@file_path[-4..-1] =~ /.pdf/i)
24
+ PdfRipper.new(@file_path).rip
25
+ end
26
+ end
27
+
28
+ end
29
+ end
@@ -0,0 +1,5 @@
1
+ module DocRipper
2
+ VERSION = "0.0.4"
3
+ end
4
+
5
+
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: doc_ripper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Paul Zaich
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-07-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Provides a lean, convenient ruby wrapper to poppler, and antiword command
42
+ line tools to quickly rip out text from common text formats.
43
+ email:
44
+ - pzaich@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - .gitignore
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - doc_ripper.gemspec
55
+ - lib/doc_ripper.rb
56
+ - lib/doc_ripper/docx_ripper.rb
57
+ - lib/doc_ripper/ms_doc_ripper.rb
58
+ - lib/doc_ripper/pdf_ripper.rb
59
+ - lib/doc_ripper/ripper/base.rb
60
+ - lib/doc_ripper/text_ripper.rb
61
+ - lib/doc_ripper/version.rb
62
+ homepage: https://github.com/pzaich/doc_ripper
63
+ licenses:
64
+ - MIT
65
+ metadata: {}
66
+ post_install_message:
67
+ rdoc_options: []
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ! '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ! '>='
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ requirements:
81
+ - Antiword
82
+ - pdftotext/poppler
83
+ rubyforge_project:
84
+ rubygems_version: 2.0.3
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Rip out text from pdf, doc and docx formats
88
+ test_files: []