doc_ripper 0.0.7.2 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc_ripper.gemspec +3 -0
- data/lib/doc_ripper.rb +34 -11
- data/lib/doc_ripper/base.rb +7 -7
- data/lib/doc_ripper/formats/docx_ripper.rb +6 -2
- data/lib/doc_ripper/formats/ms_doc_ripper.rb +2 -2
- data/lib/doc_ripper/formats/pdf_ripper.rb +2 -2
- data/lib/doc_ripper/formats/sketch_ripper.rb +3 -6
- data/lib/doc_ripper/formats/text_ripper.rb +13 -0
- data/lib/doc_ripper/version.rb +1 -3
- data/pkg/doc_ripper-0.0.7.2.gem +0 -0
- data/spec/doc_ripper/{formats/doc_ripper_spec.rb → doc_ripper_spec.rb} +32 -5
- data/spec/doc_ripper/formats/sketch_ripper_spec.rb +1 -1
- data/spec/fixtures/chinese.docx +0 -0
- data/spec/fixtures/lorem.txt +9 -0
- data/spec/spec_helper.rb +3 -1
- metadata +53 -10
- data/lib/doc_ripper/text_ripper.rb +0 -43
- data/spec/doc_ripper/base_spec.rb +0 -9
- data/spec/doc_ripper/formats/text_ripper_spec.rb +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 48d68fcd85b60d3ab64df91710c31158a998dc1a
|
4
|
+
data.tar.gz: 0aba15d63c943c4f33f5dbe14503907a72744a07
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 104082f0efdf157273abda2e201b55cb315fe861005e4a14d358a09297a2f2b7b5d98e04fdd371196f85c6325f841ad82932bb33c123cc9432b35e9c5f45d572
|
7
|
+
data.tar.gz: 2f1b2d0c81f45ed03e197b0a0a2eaa41a057629f8c5c32f2107dc09233e67f12a141e5a19fa53ab8e97dd434e66f7fee18cddf1aa6c81bf600e974c9440383de
|
data/doc_ripper.gemspec
CHANGED
@@ -27,4 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_development_dependency "bundler", "~> 1.6"
|
28
28
|
spec.add_development_dependency "rake", "~> 10.0"
|
29
29
|
spec.add_development_dependency "rspec"
|
30
|
+
spec.add_development_dependency "sqlite3"
|
31
|
+
spec.add_development_dependency "rubocop"
|
32
|
+
spec.add_development_dependency "pry"
|
30
33
|
end
|
data/lib/doc_ripper.rb
CHANGED
@@ -1,23 +1,46 @@
|
|
1
1
|
require 'shellwords'
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
require
|
2
|
+
require 'doc_ripper/version'
|
3
|
+
require 'doc_ripper/base'
|
4
|
+
require 'doc_ripper/formats/text_ripper'
|
5
|
+
require 'doc_ripper/formats/pdf_ripper'
|
6
|
+
require 'doc_ripper/formats/docx_ripper'
|
7
|
+
require 'doc_ripper/formats/ms_doc_ripper'
|
8
|
+
require 'doc_ripper/formats/sketch_ripper'
|
9
|
+
require 'doc_ripper/exceptions'
|
10
10
|
|
11
11
|
module DocRipper
|
12
12
|
class << self
|
13
13
|
def rip(path, options = {})
|
14
|
-
|
14
|
+
ripper = choose_ripper(path)
|
15
|
+
ripper.text unless ripper.nil?
|
15
16
|
end
|
16
17
|
|
17
18
|
def rip!(path)
|
18
|
-
|
19
|
+
ripper = choose_ripper(path)
|
20
|
+
raise(UnsupportedFileType) if ripper.nil?
|
19
21
|
|
20
|
-
text || raise(FileNotFound)
|
22
|
+
ripper.text || raise(FileNotFound)
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def choose_ripper(file_path)
|
28
|
+
ripper = begin
|
29
|
+
case
|
30
|
+
when !!(file_path =~ /.docx$/i)
|
31
|
+
ripper = Formats::DocxRipper.new(file_path)
|
32
|
+
when !!(file_path =~ /.doc$/i)
|
33
|
+
ripper = Formats::MsDocRipper.new(file_path)
|
34
|
+
when !!(file_path =~ /.pdf$/i)
|
35
|
+
ripper = Formats::PdfRipper.new(file_path)
|
36
|
+
when !!(file_path =~ /.sketch$/i)
|
37
|
+
ripper = Formats::SketchRipper.new(file_path)
|
38
|
+
when !!(file_path =~ /.txt$/i)
|
39
|
+
ripper = Formats::TextRipper.new(file_path)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
ripper
|
21
44
|
end
|
22
45
|
end
|
23
46
|
end
|
data/lib/doc_ripper/base.rb
CHANGED
@@ -2,16 +2,16 @@ module DocRipper
|
|
2
2
|
module Ripper
|
3
3
|
|
4
4
|
class Base
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :file_path
|
6
6
|
|
7
|
-
def initialize(file_path
|
7
|
+
def initialize(file_path)
|
8
|
+
file_parts = file_path.split('.')
|
8
9
|
@file_path = file_path
|
9
|
-
@
|
10
|
-
@options = options
|
10
|
+
@extension = file_parts.last
|
11
11
|
end
|
12
12
|
|
13
|
-
def
|
14
|
-
|
13
|
+
def text
|
14
|
+
@text ||= rip
|
15
15
|
end
|
16
16
|
|
17
17
|
private
|
@@ -22,4 +22,4 @@ module DocRipper
|
|
22
22
|
end
|
23
23
|
|
24
24
|
end
|
25
|
-
end
|
25
|
+
end
|
@@ -3,9 +3,13 @@ module DocRipper
|
|
3
3
|
class DocxRipper < Ripper::Base
|
4
4
|
|
5
5
|
def rip
|
6
|
-
@text ||=
|
6
|
+
@text ||= begin
|
7
|
+
text = %x(unzip -p #{to_shell(file_path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$')
|
8
|
+
|
9
|
+
text.empty? ? nil : text
|
10
|
+
end
|
7
11
|
end
|
8
12
|
|
9
13
|
end
|
10
14
|
end
|
11
|
-
end
|
15
|
+
end
|
@@ -4,6 +4,7 @@ require 'colored'
|
|
4
4
|
SQLITE_LOAD_WARNING = 'SQLite3 optional dependency not found. Sketch files are not supported in this mode.'.yellow
|
5
5
|
|
6
6
|
begin
|
7
|
+
# gem 'sqlite3'
|
7
8
|
require 'sqlite3'
|
8
9
|
rescue LoadError
|
9
10
|
warn SQLITE_LOAD_WARNING
|
@@ -45,17 +46,13 @@ module DocRipper
|
|
45
46
|
end
|
46
47
|
end
|
47
48
|
|
48
|
-
def initialize(file_path
|
49
|
+
def initialize(file_path)
|
49
50
|
raise Sqlite3NotFound if !defined?(SQLite3)
|
50
51
|
super
|
51
52
|
end
|
52
53
|
|
53
|
-
def read_type
|
54
|
-
:mem
|
55
|
-
end
|
56
|
-
|
57
54
|
def rip
|
58
|
-
db = SQLite3::Database.new(
|
55
|
+
db = SQLite3::Database.new(file_path)
|
59
56
|
data = db.execute("SELECT value FROM payload").flatten.first
|
60
57
|
@text ||= text_objects(data).join(" ").strip
|
61
58
|
end
|
data/lib/doc_ripper/version.rb
CHANGED
Binary file
|
@@ -10,8 +10,38 @@ module DocRipper
|
|
10
10
|
let(:missing_path) { "#{
|
11
11
|
FIXTURE_PATH}some_missing_path.docx" }
|
12
12
|
|
13
|
-
|
13
|
+
describe 'full utf-8 encoding' do
|
14
|
+
let(:result) { DocRipper.rip(file_path) }
|
15
|
+
|
16
|
+
context 'txt file' do
|
17
|
+
let(:str) { '¿Cuál es su nombre?' }
|
18
|
+
let(:file_path) { "#{FIXTURE_PATH}encoding_sample.txt" }
|
19
|
+
|
20
|
+
before(:each) do
|
21
|
+
File.write(file_path, str)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'maintains encoding' do
|
25
|
+
expect(result).to eq(str)
|
26
|
+
end
|
27
|
+
|
28
|
+
after(:each) do
|
29
|
+
File.delete(file_path)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
describe 'docx file' do
|
34
|
+
let(:str) { '四、我们确认,我们完全同意招标文件制定的投标规则,并承诺按照这些规则履行我们的所有义务,包括一旦投标文件被贵方接受,将履行社会资本合作方的义务' }
|
35
|
+
let(:file_path) { "#{FIXTURE_PATH}chinese.docx" }
|
14
36
|
|
37
|
+
it 'maintains encoding' do
|
38
|
+
puts result.encoding
|
39
|
+
expect(result).to include(str)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
context '#rip' do
|
15
45
|
it 'should respond to #rip' do
|
16
46
|
expect(DocRipper.respond_to? :rip).to eq(true)
|
17
47
|
end
|
@@ -34,7 +64,6 @@ module DocRipper
|
|
34
64
|
it 'should remove the dumped text version of the file' do
|
35
65
|
|
36
66
|
end
|
37
|
-
|
38
67
|
end
|
39
68
|
|
40
69
|
context '#rip!' do
|
@@ -51,7 +80,5 @@ module DocRipper
|
|
51
80
|
end
|
52
81
|
|
53
82
|
end
|
54
|
-
|
55
|
-
|
56
83
|
end
|
57
|
-
end
|
84
|
+
end
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
|
2
|
+
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod
|
3
|
+
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim
|
4
|
+
veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea
|
5
|
+
commodo consequat. Duis aute irure dolor in reprehenderit in voluptate
|
6
|
+
velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat
|
7
|
+
cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id
|
8
|
+
est laborum.
|
9
|
+
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc_ripper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Zaich
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-11-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: CFPropertyList
|
@@ -80,6 +80,48 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: sqlite3
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubocop
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: pry
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
83
125
|
description: Scrape text from common file formats (.pdf,.doc,.docx, .sketch, .txt)
|
84
126
|
with a single convenient command.
|
85
127
|
email:
|
@@ -102,20 +144,21 @@ files:
|
|
102
144
|
- lib/doc_ripper/formats/ms_doc_ripper.rb
|
103
145
|
- lib/doc_ripper/formats/pdf_ripper.rb
|
104
146
|
- lib/doc_ripper/formats/sketch_ripper.rb
|
105
|
-
- lib/doc_ripper/text_ripper.rb
|
147
|
+
- lib/doc_ripper/formats/text_ripper.rb
|
106
148
|
- lib/doc_ripper/version.rb
|
107
149
|
- pkg/doc_ripper-0.0.5.gem
|
108
150
|
- pkg/doc_ripper-0.0.6.gem
|
109
151
|
- pkg/doc_ripper-0.0.7.1.gem
|
152
|
+
- pkg/doc_ripper-0.0.7.2.gem
|
110
153
|
- pkg/doc_ripper-0.0.7.gem
|
111
|
-
- spec/doc_ripper/
|
112
|
-
- spec/doc_ripper/formats/doc_ripper_spec.rb
|
154
|
+
- spec/doc_ripper/doc_ripper_spec.rb
|
113
155
|
- spec/doc_ripper/formats/sketch_ripper_spec.rb
|
114
|
-
- spec/
|
156
|
+
- spec/fixtures/chinese.docx
|
115
157
|
- spec/fixtures/complex_sketch_text.sketch
|
116
158
|
- spec/fixtures/lorem.doc
|
117
159
|
- spec/fixtures/lorem.docx
|
118
160
|
- spec/fixtures/lorem.pdf
|
161
|
+
- spec/fixtures/lorem.txt
|
119
162
|
- spec/fixtures/missing_file.txt
|
120
163
|
- spec/fixtures/simple_sketch_text.sketch
|
121
164
|
- spec/fixtures/some_missing_path.txt
|
@@ -142,19 +185,19 @@ requirements:
|
|
142
185
|
- Antiword
|
143
186
|
- pdftotext/poppler
|
144
187
|
rubyforge_project:
|
145
|
-
rubygems_version: 2.6.
|
188
|
+
rubygems_version: 2.6.14
|
146
189
|
signing_key:
|
147
190
|
specification_version: 4
|
148
191
|
summary: Rip out text from pdf, doc and docx formats
|
149
192
|
test_files:
|
150
|
-
- spec/doc_ripper/
|
151
|
-
- spec/doc_ripper/formats/doc_ripper_spec.rb
|
193
|
+
- spec/doc_ripper/doc_ripper_spec.rb
|
152
194
|
- spec/doc_ripper/formats/sketch_ripper_spec.rb
|
153
|
-
- spec/
|
195
|
+
- spec/fixtures/chinese.docx
|
154
196
|
- spec/fixtures/complex_sketch_text.sketch
|
155
197
|
- spec/fixtures/lorem.doc
|
156
198
|
- spec/fixtures/lorem.docx
|
157
199
|
- spec/fixtures/lorem.pdf
|
200
|
+
- spec/fixtures/lorem.txt
|
158
201
|
- spec/fixtures/missing_file.txt
|
159
202
|
- spec/fixtures/simple_sketch_text.sketch
|
160
203
|
- spec/fixtures/some_missing_path.txt
|
@@ -1,43 +0,0 @@
|
|
1
|
-
#encoding: UTF-8
|
2
|
-
|
3
|
-
module DocRipper
|
4
|
-
class TextRipper < Ripper::Base
|
5
|
-
attr_reader :text_file_path, :file_path
|
6
|
-
|
7
|
-
def ripped?
|
8
|
-
@is_ripped ||=choose_ripper
|
9
|
-
end
|
10
|
-
|
11
|
-
def text
|
12
|
-
if ripped? && @ripper.read_type == :file
|
13
|
-
@text = IO.read(@text_file_path).force_encoding("ISO-8859-1").encode("utf-8", replace: nil)
|
14
|
-
File.delete(@text_file_path)
|
15
|
-
|
16
|
-
elsif ripped? && @ripper.read_type == :mem
|
17
|
-
@text = @ripper.text
|
18
|
-
end
|
19
|
-
|
20
|
-
@text
|
21
|
-
end
|
22
|
-
|
23
|
-
private
|
24
|
-
|
25
|
-
def choose_ripper
|
26
|
-
case
|
27
|
-
when !!(@file_path =~ /.docx$/i)
|
28
|
-
@ripper = Formats::DocxRipper.new(@file_path)
|
29
|
-
when !!(@file_path =~ /.doc$/i)
|
30
|
-
@ripper = Formats::MsDocRipper.new(@file_path)
|
31
|
-
when !!(@file_path =~ /.pdf$/i)
|
32
|
-
@ripper = Formats::PdfRipper.new(@file_path)
|
33
|
-
when !!(@file_path =~ /.sketch$/i)
|
34
|
-
@ripper = Formats::SketchRipper.new(@file_path)
|
35
|
-
when @options[:raise]
|
36
|
-
raise UnsupportedFileType
|
37
|
-
end
|
38
|
-
|
39
|
-
@ripper.rip
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
end
|