doc_ripper 0.0.7.2 → 0.0.8
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc_ripper.gemspec +3 -0
- data/lib/doc_ripper.rb +34 -11
- data/lib/doc_ripper/base.rb +7 -7
- data/lib/doc_ripper/formats/docx_ripper.rb +6 -2
- data/lib/doc_ripper/formats/ms_doc_ripper.rb +2 -2
- data/lib/doc_ripper/formats/pdf_ripper.rb +2 -2
- data/lib/doc_ripper/formats/sketch_ripper.rb +3 -6
- data/lib/doc_ripper/formats/text_ripper.rb +13 -0
- data/lib/doc_ripper/version.rb +1 -3
- data/pkg/doc_ripper-0.0.7.2.gem +0 -0
- data/spec/doc_ripper/{formats/doc_ripper_spec.rb → doc_ripper_spec.rb} +32 -5
- data/spec/doc_ripper/formats/sketch_ripper_spec.rb +1 -1
- data/spec/fixtures/chinese.docx +0 -0
- data/spec/fixtures/lorem.txt +9 -0
- data/spec/spec_helper.rb +3 -1
- metadata +53 -10
- data/lib/doc_ripper/text_ripper.rb +0 -43
- data/spec/doc_ripper/base_spec.rb +0 -9
- data/spec/doc_ripper/formats/text_ripper_spec.rb +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 48d68fcd85b60d3ab64df91710c31158a998dc1a
|
4
|
+
data.tar.gz: 0aba15d63c943c4f33f5dbe14503907a72744a07
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 104082f0efdf157273abda2e201b55cb315fe861005e4a14d358a09297a2f2b7b5d98e04fdd371196f85c6325f841ad82932bb33c123cc9432b35e9c5f45d572
|
7
|
+
data.tar.gz: 2f1b2d0c81f45ed03e197b0a0a2eaa41a057629f8c5c32f2107dc09233e67f12a141e5a19fa53ab8e97dd434e66f7fee18cddf1aa6c81bf600e974c9440383de
|
data/doc_ripper.gemspec
CHANGED
@@ -27,4 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_development_dependency "bundler", "~> 1.6"
|
28
28
|
spec.add_development_dependency "rake", "~> 10.0"
|
29
29
|
spec.add_development_dependency "rspec"
|
30
|
+
spec.add_development_dependency "sqlite3"
|
31
|
+
spec.add_development_dependency "rubocop"
|
32
|
+
spec.add_development_dependency "pry"
|
30
33
|
end
|
data/lib/doc_ripper.rb
CHANGED
@@ -1,23 +1,46 @@
|
|
1
1
|
require 'shellwords'
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
require
|
2
|
+
require 'doc_ripper/version'
|
3
|
+
require 'doc_ripper/base'
|
4
|
+
require 'doc_ripper/formats/text_ripper'
|
5
|
+
require 'doc_ripper/formats/pdf_ripper'
|
6
|
+
require 'doc_ripper/formats/docx_ripper'
|
7
|
+
require 'doc_ripper/formats/ms_doc_ripper'
|
8
|
+
require 'doc_ripper/formats/sketch_ripper'
|
9
|
+
require 'doc_ripper/exceptions'
|
10
10
|
|
11
11
|
module DocRipper
|
12
12
|
class << self
|
13
13
|
def rip(path, options = {})
|
14
|
-
|
14
|
+
ripper = choose_ripper(path)
|
15
|
+
ripper.text unless ripper.nil?
|
15
16
|
end
|
16
17
|
|
17
18
|
def rip!(path)
|
18
|
-
|
19
|
+
ripper = choose_ripper(path)
|
20
|
+
raise(UnsupportedFileType) if ripper.nil?
|
19
21
|
|
20
|
-
text || raise(FileNotFound)
|
22
|
+
ripper.text || raise(FileNotFound)
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def choose_ripper(file_path)
|
28
|
+
ripper = begin
|
29
|
+
case
|
30
|
+
when !!(file_path =~ /.docx$/i)
|
31
|
+
ripper = Formats::DocxRipper.new(file_path)
|
32
|
+
when !!(file_path =~ /.doc$/i)
|
33
|
+
ripper = Formats::MsDocRipper.new(file_path)
|
34
|
+
when !!(file_path =~ /.pdf$/i)
|
35
|
+
ripper = Formats::PdfRipper.new(file_path)
|
36
|
+
when !!(file_path =~ /.sketch$/i)
|
37
|
+
ripper = Formats::SketchRipper.new(file_path)
|
38
|
+
when !!(file_path =~ /.txt$/i)
|
39
|
+
ripper = Formats::TextRipper.new(file_path)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
ripper
|
21
44
|
end
|
22
45
|
end
|
23
46
|
end
|
data/lib/doc_ripper/base.rb
CHANGED
@@ -2,16 +2,16 @@ module DocRipper
|
|
2
2
|
module Ripper
|
3
3
|
|
4
4
|
class Base
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :file_path
|
6
6
|
|
7
|
-
def initialize(file_path
|
7
|
+
def initialize(file_path)
|
8
|
+
file_parts = file_path.split('.')
|
8
9
|
@file_path = file_path
|
9
|
-
@
|
10
|
-
@options = options
|
10
|
+
@extension = file_parts.last
|
11
11
|
end
|
12
12
|
|
13
|
-
def
|
14
|
-
|
13
|
+
def text
|
14
|
+
@text ||= rip
|
15
15
|
end
|
16
16
|
|
17
17
|
private
|
@@ -22,4 +22,4 @@ module DocRipper
|
|
22
22
|
end
|
23
23
|
|
24
24
|
end
|
25
|
-
end
|
25
|
+
end
|
@@ -3,9 +3,13 @@ module DocRipper
|
|
3
3
|
class DocxRipper < Ripper::Base
|
4
4
|
|
5
5
|
def rip
|
6
|
-
@text ||=
|
6
|
+
@text ||= begin
|
7
|
+
text = %x(unzip -p #{to_shell(file_path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$')
|
8
|
+
|
9
|
+
text.empty? ? nil : text
|
10
|
+
end
|
7
11
|
end
|
8
12
|
|
9
13
|
end
|
10
14
|
end
|
11
|
-
end
|
15
|
+
end
|
@@ -4,6 +4,7 @@ require 'colored'
|
|
4
4
|
SQLITE_LOAD_WARNING = 'SQLite3 optional dependency not found. Sketch files are not supported in this mode.'.yellow
|
5
5
|
|
6
6
|
begin
|
7
|
+
# gem 'sqlite3'
|
7
8
|
require 'sqlite3'
|
8
9
|
rescue LoadError
|
9
10
|
warn SQLITE_LOAD_WARNING
|
@@ -45,17 +46,13 @@ module DocRipper
|
|
45
46
|
end
|
46
47
|
end
|
47
48
|
|
48
|
-
def initialize(file_path
|
49
|
+
def initialize(file_path)
|
49
50
|
raise Sqlite3NotFound if !defined?(SQLite3)
|
50
51
|
super
|
51
52
|
end
|
52
53
|
|
53
|
-
def read_type
|
54
|
-
:mem
|
55
|
-
end
|
56
|
-
|
57
54
|
def rip
|
58
|
-
db = SQLite3::Database.new(
|
55
|
+
db = SQLite3::Database.new(file_path)
|
59
56
|
data = db.execute("SELECT value FROM payload").flatten.first
|
60
57
|
@text ||= text_objects(data).join(" ").strip
|
61
58
|
end
|
data/lib/doc_ripper/version.rb
CHANGED
Binary file
|
@@ -10,8 +10,38 @@ module DocRipper
|
|
10
10
|
let(:missing_path) { "#{
|
11
11
|
FIXTURE_PATH}some_missing_path.docx" }
|
12
12
|
|
13
|
-
|
13
|
+
describe 'full utf-8 encoding' do
|
14
|
+
let(:result) { DocRipper.rip(file_path) }
|
15
|
+
|
16
|
+
context 'txt file' do
|
17
|
+
let(:str) { '¿Cuál es su nombre?' }
|
18
|
+
let(:file_path) { "#{FIXTURE_PATH}encoding_sample.txt" }
|
19
|
+
|
20
|
+
before(:each) do
|
21
|
+
File.write(file_path, str)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'maintains encoding' do
|
25
|
+
expect(result).to eq(str)
|
26
|
+
end
|
27
|
+
|
28
|
+
after(:each) do
|
29
|
+
File.delete(file_path)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
describe 'docx file' do
|
34
|
+
let(:str) { '四、我们确认,我们完全同意招标文件制定的投标规则,并承诺按照这些规则履行我们的所有义务,包括一旦投标文件被贵方接受,将履行社会资本合作方的义务' }
|
35
|
+
let(:file_path) { "#{FIXTURE_PATH}chinese.docx" }
|
14
36
|
|
37
|
+
it 'maintains encoding' do
|
38
|
+
puts result.encoding
|
39
|
+
expect(result).to include(str)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
context '#rip' do
|
15
45
|
it 'should respond to #rip' do
|
16
46
|
expect(DocRipper.respond_to? :rip).to eq(true)
|
17
47
|
end
|
@@ -34,7 +64,6 @@ module DocRipper
|
|
34
64
|
it 'should remove the dumped text version of the file' do
|
35
65
|
|
36
66
|
end
|
37
|
-
|
38
67
|
end
|
39
68
|
|
40
69
|
context '#rip!' do
|
@@ -51,7 +80,5 @@ module DocRipper
|
|
51
80
|
end
|
52
81
|
|
53
82
|
end
|
54
|
-
|
55
|
-
|
56
83
|
end
|
57
|
-
end
|
84
|
+
end
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
|
2
|
+
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod
|
3
|
+
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim
|
4
|
+
veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea
|
5
|
+
commodo consequat. Duis aute irure dolor in reprehenderit in voluptate
|
6
|
+
velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat
|
7
|
+
cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id
|
8
|
+
est laborum.
|
9
|
+
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc_ripper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Zaich
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-11-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: CFPropertyList
|
@@ -80,6 +80,48 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: sqlite3
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubocop
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: pry
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
83
125
|
description: Scrape text from common file formats (.pdf,.doc,.docx, .sketch, .txt)
|
84
126
|
with a single convenient command.
|
85
127
|
email:
|
@@ -102,20 +144,21 @@ files:
|
|
102
144
|
- lib/doc_ripper/formats/ms_doc_ripper.rb
|
103
145
|
- lib/doc_ripper/formats/pdf_ripper.rb
|
104
146
|
- lib/doc_ripper/formats/sketch_ripper.rb
|
105
|
-
- lib/doc_ripper/text_ripper.rb
|
147
|
+
- lib/doc_ripper/formats/text_ripper.rb
|
106
148
|
- lib/doc_ripper/version.rb
|
107
149
|
- pkg/doc_ripper-0.0.5.gem
|
108
150
|
- pkg/doc_ripper-0.0.6.gem
|
109
151
|
- pkg/doc_ripper-0.0.7.1.gem
|
152
|
+
- pkg/doc_ripper-0.0.7.2.gem
|
110
153
|
- pkg/doc_ripper-0.0.7.gem
|
111
|
-
- spec/doc_ripper/
|
112
|
-
- spec/doc_ripper/formats/doc_ripper_spec.rb
|
154
|
+
- spec/doc_ripper/doc_ripper_spec.rb
|
113
155
|
- spec/doc_ripper/formats/sketch_ripper_spec.rb
|
114
|
-
- spec/
|
156
|
+
- spec/fixtures/chinese.docx
|
115
157
|
- spec/fixtures/complex_sketch_text.sketch
|
116
158
|
- spec/fixtures/lorem.doc
|
117
159
|
- spec/fixtures/lorem.docx
|
118
160
|
- spec/fixtures/lorem.pdf
|
161
|
+
- spec/fixtures/lorem.txt
|
119
162
|
- spec/fixtures/missing_file.txt
|
120
163
|
- spec/fixtures/simple_sketch_text.sketch
|
121
164
|
- spec/fixtures/some_missing_path.txt
|
@@ -142,19 +185,19 @@ requirements:
|
|
142
185
|
- Antiword
|
143
186
|
- pdftotext/poppler
|
144
187
|
rubyforge_project:
|
145
|
-
rubygems_version: 2.6.
|
188
|
+
rubygems_version: 2.6.14
|
146
189
|
signing_key:
|
147
190
|
specification_version: 4
|
148
191
|
summary: Rip out text from pdf, doc and docx formats
|
149
192
|
test_files:
|
150
|
-
- spec/doc_ripper/
|
151
|
-
- spec/doc_ripper/formats/doc_ripper_spec.rb
|
193
|
+
- spec/doc_ripper/doc_ripper_spec.rb
|
152
194
|
- spec/doc_ripper/formats/sketch_ripper_spec.rb
|
153
|
-
- spec/
|
195
|
+
- spec/fixtures/chinese.docx
|
154
196
|
- spec/fixtures/complex_sketch_text.sketch
|
155
197
|
- spec/fixtures/lorem.doc
|
156
198
|
- spec/fixtures/lorem.docx
|
157
199
|
- spec/fixtures/lorem.pdf
|
200
|
+
- spec/fixtures/lorem.txt
|
158
201
|
- spec/fixtures/missing_file.txt
|
159
202
|
- spec/fixtures/simple_sketch_text.sketch
|
160
203
|
- spec/fixtures/some_missing_path.txt
|
@@ -1,43 +0,0 @@
|
|
1
|
-
#encoding: UTF-8
|
2
|
-
|
3
|
-
module DocRipper
|
4
|
-
class TextRipper < Ripper::Base
|
5
|
-
attr_reader :text_file_path, :file_path
|
6
|
-
|
7
|
-
def ripped?
|
8
|
-
@is_ripped ||=choose_ripper
|
9
|
-
end
|
10
|
-
|
11
|
-
def text
|
12
|
-
if ripped? && @ripper.read_type == :file
|
13
|
-
@text = IO.read(@text_file_path).force_encoding("ISO-8859-1").encode("utf-8", replace: nil)
|
14
|
-
File.delete(@text_file_path)
|
15
|
-
|
16
|
-
elsif ripped? && @ripper.read_type == :mem
|
17
|
-
@text = @ripper.text
|
18
|
-
end
|
19
|
-
|
20
|
-
@text
|
21
|
-
end
|
22
|
-
|
23
|
-
private
|
24
|
-
|
25
|
-
def choose_ripper
|
26
|
-
case
|
27
|
-
when !!(@file_path =~ /.docx$/i)
|
28
|
-
@ripper = Formats::DocxRipper.new(@file_path)
|
29
|
-
when !!(@file_path =~ /.doc$/i)
|
30
|
-
@ripper = Formats::MsDocRipper.new(@file_path)
|
31
|
-
when !!(@file_path =~ /.pdf$/i)
|
32
|
-
@ripper = Formats::PdfRipper.new(@file_path)
|
33
|
-
when !!(@file_path =~ /.sketch$/i)
|
34
|
-
@ripper = Formats::SketchRipper.new(@file_path)
|
35
|
-
when @options[:raise]
|
36
|
-
raise UnsupportedFileType
|
37
|
-
end
|
38
|
-
|
39
|
-
@ripper.rip
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
end
|