doc_ripper 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +20 -2
- data/doc_ripper.gemspec +4 -0
- data/lib/doc_ripper.rb +8 -9
- data/lib/doc_ripper/{ripper/base.rb → base.rb} +4 -0
- data/lib/doc_ripper/formats/docx_ripper.rb +11 -0
- data/lib/doc_ripper/formats/ms_doc_ripper.rb +11 -0
- data/lib/doc_ripper/formats/pdf_ripper.rb +11 -0
- data/lib/doc_ripper/formats/sketch_ripper.rb +84 -0
- data/lib/doc_ripper/text_ripper.rb +20 -8
- data/lib/doc_ripper/version.rb +1 -1
- data/pkg/doc_ripper-0.0.5.gem +0 -0
- data/spec/doc_ripper/{ripper/base_spec.rb → base_spec.rb} +0 -0
- data/spec/doc_ripper/{doc_ripper_spec.rb → formats/doc_ripper_spec.rb} +0 -0
- data/spec/doc_ripper/formats/sketch_ripper_spec.rb +29 -0
- data/spec/doc_ripper/{text_ripper_spec.rb → formats/text_ripper_spec.rb} +0 -0
- data/spec/fixtures/complex_sketch_text.sketch +0 -0
- data/spec/fixtures/simple_sketch_text.sketch +0 -0
- metadata +62 -14
- data/lib/doc_ripper/docx_ripper.rb +0 -9
- data/lib/doc_ripper/ms_doc_ripper.rb +0 -9
- data/lib/doc_ripper/pdf_ripper.rb +0 -9
- data/spec/fixtures/lorem.txt +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a97f6b37326f9f22afd538cd95a86da46caf5c48
|
4
|
+
data.tar.gz: 5ff485ab583cacfec99c9dcff702296d0cb5d1bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c450f92c3a65d8c2bf0ec167eeadaebcfa1ddbb3d0f699d1577bc112e0c98a8b77dfc8d49dac3f20073726b42b6c7788ff8b128b6b444fe39a4ec336f42f0b5a
|
7
|
+
data.tar.gz: c9e1cc600b2e43a6f6f3551092d674b0eefcebf2983874d79445d50631289747cf0285481caa0252cbc0e0e1dab16c14a221098197577864d172479ab4f07e06
|
data/README.md
CHANGED
@@ -1,11 +1,29 @@
|
|
1
1
|
# DocRipper
|
2
|
+
[![Gem Version](https://badge.fury.io/rb/doc_ripper.svg)](http://badge.fury.io/rb/doc_ripper)
|
2
3
|
|
3
|
-
Grab the text from common document formats with 1 command. DocRipper is an extremely lightweight Ruby wrapper that can be used to parse text contents from common file formats (currently .doc, .docx and .pdf) without the need for a large number of dependencies like an OCR library or OpenOffice/LibreOffice.
|
4
|
+
Grab the text from common document formats with 1 command. DocRipper is an extremely lightweight Ruby wrapper that can be used to parse text contents from common file formats (currently .doc, .docx, .pdf and .sketch) without the need for a large number of dependencies like an OCR library or OpenOffice/LibreOffice.
|
4
5
|
|
5
6
|
For simple parsing, you'll likely see a large performance improvement with DocRipper over solutions that rely on OpenOffice/LibreOffice for .doc/.docx conversion.
|
6
7
|
|
7
8
|
Need OCR support or in-image text parsing? Take a look at [Docsplit](https://github.com/documentcloud/docsplit).
|
8
9
|
|
10
|
+
### Supported File Formats
|
11
|
+
````
|
12
|
+
.doc
|
13
|
+
.docx
|
14
|
+
.pdf
|
15
|
+
.txt
|
16
|
+
.sketch
|
17
|
+
````
|
18
|
+
|
19
|
+
File format | Supported? | Dependencies
|
20
|
+
------------|------------|-------------
|
21
|
+
.doc | x | Antiword
|
22
|
+
.docx | x |
|
23
|
+
.pdf | x | Poppler-utils
|
24
|
+
.txt | x |
|
25
|
+
.sketch | x |
|
26
|
+
|
9
27
|
## Quickstart
|
10
28
|
|
11
29
|
```
|
@@ -27,7 +45,7 @@ Need OCR support or in-image text parsing? Take a look at [Docsplit](https://git
|
|
27
45
|
```
|
28
46
|
|
29
47
|
#### Want to raise an exception? Use #rip!
|
30
|
-
|
48
|
+
\#rip! raises an exception if \#rip returns nil or the file type isn't supported.
|
31
49
|
|
32
50
|
```
|
33
51
|
# invalid file type
|
data/doc_ripper.gemspec
CHANGED
@@ -21,6 +21,10 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.requirements << 'Antiword'
|
22
22
|
spec.requirements << "pdftotext/poppler"
|
23
23
|
|
24
|
+
spec.add_dependency "sqlite3", "~> 1.3.11"
|
25
|
+
spec.add_dependency "activesupport", "~> 4.2.6"
|
26
|
+
spec.add_dependency "CFPropertyList", '~> 2.3'
|
27
|
+
|
24
28
|
spec.add_development_dependency "bundler", "~> 1.6"
|
25
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
26
30
|
spec.add_development_dependency "rspec"
|
data/lib/doc_ripper.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
require 'shellwords'
|
2
|
+
require "sqlite3"
|
2
3
|
require "doc_ripper/version"
|
3
|
-
require "doc_ripper/
|
4
|
+
require "doc_ripper/base"
|
4
5
|
require "doc_ripper/text_ripper"
|
5
|
-
require "doc_ripper/pdf_ripper"
|
6
|
-
require "doc_ripper/docx_ripper"
|
7
|
-
require "doc_ripper/ms_doc_ripper"
|
6
|
+
require "doc_ripper/formats/pdf_ripper"
|
7
|
+
require "doc_ripper/formats/docx_ripper"
|
8
|
+
require "doc_ripper/formats/ms_doc_ripper"
|
9
|
+
require "doc_ripper/formats/sketch_ripper"
|
8
10
|
require "doc_ripper/exceptions"
|
9
11
|
|
10
12
|
module DocRipper
|
@@ -15,11 +17,8 @@ module DocRipper
|
|
15
17
|
|
16
18
|
def rip!(path)
|
17
19
|
text = rip(path, raise: true)
|
18
|
-
|
19
|
-
|
20
|
-
else
|
21
|
-
raise FileNotFound
|
22
|
-
end
|
20
|
+
|
21
|
+
text || raise(FileNotFound)
|
23
22
|
end
|
24
23
|
end
|
25
24
|
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module DocRipper
|
2
|
+
module Formats
|
3
|
+
class DocxRipper < Ripper::Base
|
4
|
+
|
5
|
+
def rip
|
6
|
+
@text ||= system(%Q[ unzip -p #{to_shell(@file_path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$' > #{to_shell(@text_file_path)} ])
|
7
|
+
end
|
8
|
+
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/core_ext'
|
3
|
+
require 'cfpropertylist'
|
4
|
+
|
5
|
+
module DocRipper
|
6
|
+
module Formats
|
7
|
+
class SketchRipper < Ripper::Base
|
8
|
+
|
9
|
+
class CFPropertyList::CFString
|
10
|
+
def to_s
|
11
|
+
value
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
class CFPropertyList::CFType
|
16
|
+
def blacklisted_class?
|
17
|
+
return false if !self.value.respond_to?(:[])
|
18
|
+
|
19
|
+
klass = self.value['$class']
|
20
|
+
|
21
|
+
# Sketch Internal ID References
|
22
|
+
# 39 = rectangle / artboard / page / group
|
23
|
+
# 170 = font definition
|
24
|
+
|
25
|
+
return false if !klass
|
26
|
+
[170].include?(klass.value)
|
27
|
+
end
|
28
|
+
|
29
|
+
def sketch_page?
|
30
|
+
return false if !self.value.respond_to?(:[])
|
31
|
+
klass = self.value['$classes']
|
32
|
+
|
33
|
+
return false if !klass
|
34
|
+
klass.is_a?(CFPropertyList::CFArray)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def read_type
|
39
|
+
:mem
|
40
|
+
end
|
41
|
+
|
42
|
+
def rip
|
43
|
+
db = SQLite3::Database.new(@file_path)
|
44
|
+
data = db.execute("SELECT value FROM payload").flatten.first
|
45
|
+
@text ||= text_objects(data).join(" ").strip
|
46
|
+
end
|
47
|
+
|
48
|
+
def blacklist
|
49
|
+
%w(\$null MSAttributedStringFontAttribute NSColor NSParagraphStyle)
|
50
|
+
end
|
51
|
+
|
52
|
+
def text_objects(data)
|
53
|
+
objects = CFPropertyList::List.new(data: data).value.value['$objects'].value
|
54
|
+
|
55
|
+
evaluator = Proc.new do |object, previous_object, n_2_previous_object, next_object|
|
56
|
+
coordinatesRegex = /\{\{\d*, \d*}, \{\d*, \d*\}\}|\{[\d.e-]*, [\d.]*\}/
|
57
|
+
|
58
|
+
object.is_a?(CFPropertyList::CFString) &&
|
59
|
+
#ignore other blacklisted properties
|
60
|
+
blacklist.select { |bl| object.value.match(/#{bl}/) }.empty? &&
|
61
|
+
#ignore uuids
|
62
|
+
!object.value.match(/\w{8}-\w{4}-\w{4}-\w{4}-\w{12}/) &&
|
63
|
+
#ignore coordinates
|
64
|
+
!object.value.match(coordinatesRegex) &&
|
65
|
+
#ignore font definitions
|
66
|
+
previous_object.value != "NSFontNameAttribute" &&
|
67
|
+
# labels always have a dictionary defined afterwards
|
68
|
+
next_object.is_a?(CFPropertyList::CFDictionary) &&
|
69
|
+
# Check if the string is defining the name of an artboard or font
|
70
|
+
!(previous_object.respond_to?(:blacklisted_class?) && previous_object.blacklisted_class?) &&
|
71
|
+
!(n_2_previous_object.respond_to?(:blacklisted_class?) && n_2_previous_object.blacklisted_class?)
|
72
|
+
end
|
73
|
+
|
74
|
+
objects.select.with_index do |object,i|
|
75
|
+
next_object = objects[i+1]
|
76
|
+
previous_object = objects[i-1]
|
77
|
+
n_2_previous_object = objects[i-2]
|
78
|
+
|
79
|
+
evaluator.call(object, previous_object, n_2_previous_object, next_object)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
@@ -4,27 +4,39 @@ module DocRipper
|
|
4
4
|
class TextRipper < Ripper::Base
|
5
5
|
attr_reader :text_file_path, :file_path
|
6
6
|
|
7
|
-
def
|
7
|
+
def ripped?
|
8
8
|
@is_ripped ||=choose_ripper
|
9
9
|
end
|
10
10
|
|
11
11
|
def text
|
12
|
-
|
12
|
+
if ripped? && @ripper.read_type == :file
|
13
|
+
@text = IO.read(@text_file_path).force_encoding("ISO-8859-1").encode("utf-8", replace: nil)
|
14
|
+
File.delete(@text_file_path)
|
15
|
+
|
16
|
+
elsif ripped? && @ripper.read_type == :mem
|
17
|
+
@text = @ripper.text
|
18
|
+
end
|
19
|
+
|
20
|
+
@text
|
13
21
|
end
|
14
22
|
|
15
23
|
private
|
16
24
|
|
17
25
|
def choose_ripper
|
18
26
|
case
|
19
|
-
when !!(@file_path
|
20
|
-
DocxRipper.new(@file_path)
|
21
|
-
when !!(@file_path
|
22
|
-
MsDocRipper.new(@file_path)
|
23
|
-
when !!(@file_path
|
24
|
-
PdfRipper.new(@file_path)
|
27
|
+
when !!(@file_path =~ /.docx$/i)
|
28
|
+
@ripper = Formats::DocxRipper.new(@file_path)
|
29
|
+
when !!(@file_path =~ /.doc$/i)
|
30
|
+
@ripper = Formats::MsDocRipper.new(@file_path)
|
31
|
+
when !!(@file_path =~ /.pdf$/i)
|
32
|
+
@ripper = Formats::PdfRipper.new(@file_path)
|
33
|
+
when !!(@file_path =~ /.sketch$/i)
|
34
|
+
@ripper = Formats::SketchRipper.new(@file_path)
|
25
35
|
when @options[:raise]
|
26
36
|
raise UnsupportedFileType
|
27
37
|
end
|
38
|
+
|
39
|
+
@ripper.rip
|
28
40
|
end
|
29
41
|
|
30
42
|
end
|
data/lib/doc_ripper/version.rb
CHANGED
Binary file
|
File without changes
|
File without changes
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module DocRipper
|
4
|
+
describe 'SketchRipper' do
|
5
|
+
let(:simple_sketch_path) { "#{FIXTURE_PATH}simple_sketch_text.sketch" }
|
6
|
+
let(:simple_sketch_text) { "Page 1 t Grab some text Grab some text t copy" }
|
7
|
+
let(:complex_sketch_path) { "#{FIXTURE_PATH}complex_sketch_text.sketch" }
|
8
|
+
let(:complex_sketch_text) do
|
9
|
+
"Page 1 Onboarding Wizard -- Step 3 Header Rectangle 20 Path UtilityZen UtilityZen Line notification-icons---download-for-free-at-icons8 Shape gear-icons---download-for-free-at-icons8 Rectangle 293 Sync the accounts us Sync the accounts used by 484 Sexton. You\u2019ll be asked to approve access so that we can begin monitoring home usage. Don\u2019t see one of the Don\u2019t see one of the utilities your home uses? Let us know. 2/2 Your Accounts 2/2 Your Accounts Group Rectangle 294 Next step Utility Full Chit Gas + Power Utility Rectangle 279 Pacific_Gas_and_Electric_Company_(logo) Layer_1 g2105 g2107 g2109 path2111 path2111-path g2113 path2115 path2115-path g2117 path2119 path2119-path g2121 path2123 path2123-path g2125 path2127 path2127-path g2129 path2131 path2131-path g2133 path2135 path2135-path path2135-path path2137 path2137-path path2137-path Utility Full Chit Gas + Power Rectangle 279 sfpuc-logo-vert"
|
10
|
+
end
|
11
|
+
|
12
|
+
describe '#rip' do
|
13
|
+
|
14
|
+
let(:ripper) { DocRipper.rip(simple_sketch_path) }
|
15
|
+
|
16
|
+
it 'returns all text labels, layer names and page names from Sketch documents' do
|
17
|
+
expect(ripper).to eq(simple_sketch_text)
|
18
|
+
end
|
19
|
+
|
20
|
+
context 'complex sketch example' do
|
21
|
+
let(:ripper) { DocRipper.rip(complex_sketch_path) }
|
22
|
+
|
23
|
+
it 'returns matching text from labels' do
|
24
|
+
expect(ripper.split(' ')).to match_array(complex_sketch_text.split(' '))
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
File without changes
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,15 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc_ripper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Zaich
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-07-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: sqlite3
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.3.11
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.3.11
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: activesupport
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 4.2.6
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 4.2.6
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: CFPropertyList
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.3'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.3'
|
13
55
|
- !ruby/object:Gem::Dependency
|
14
56
|
name: bundler
|
15
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -68,21 +110,25 @@ files:
|
|
68
110
|
- Rakefile
|
69
111
|
- doc_ripper.gemspec
|
70
112
|
- lib/doc_ripper.rb
|
71
|
-
- lib/doc_ripper/
|
113
|
+
- lib/doc_ripper/base.rb
|
72
114
|
- lib/doc_ripper/exceptions.rb
|
73
|
-
- lib/doc_ripper/
|
74
|
-
- lib/doc_ripper/
|
75
|
-
- lib/doc_ripper/
|
115
|
+
- lib/doc_ripper/formats/docx_ripper.rb
|
116
|
+
- lib/doc_ripper/formats/ms_doc_ripper.rb
|
117
|
+
- lib/doc_ripper/formats/pdf_ripper.rb
|
118
|
+
- lib/doc_ripper/formats/sketch_ripper.rb
|
76
119
|
- lib/doc_ripper/text_ripper.rb
|
77
120
|
- lib/doc_ripper/version.rb
|
78
|
-
-
|
79
|
-
- spec/doc_ripper/
|
80
|
-
- spec/doc_ripper/
|
121
|
+
- pkg/doc_ripper-0.0.5.gem
|
122
|
+
- spec/doc_ripper/base_spec.rb
|
123
|
+
- spec/doc_ripper/formats/doc_ripper_spec.rb
|
124
|
+
- spec/doc_ripper/formats/sketch_ripper_spec.rb
|
125
|
+
- spec/doc_ripper/formats/text_ripper_spec.rb
|
126
|
+
- spec/fixtures/complex_sketch_text.sketch
|
81
127
|
- spec/fixtures/lorem.doc
|
82
128
|
- spec/fixtures/lorem.docx
|
83
129
|
- spec/fixtures/lorem.pdf
|
84
|
-
- spec/fixtures/lorem.txt
|
85
130
|
- spec/fixtures/missing_file.txt
|
131
|
+
- spec/fixtures/simple_sketch_text.sketch
|
86
132
|
- spec/fixtures/some_missing_path.txt
|
87
133
|
- spec/spec_helper.rb
|
88
134
|
homepage: https://github.com/pzaich/doc_ripper
|
@@ -112,13 +158,15 @@ signing_key:
|
|
112
158
|
specification_version: 4
|
113
159
|
summary: Rip out text from pdf, doc and docx formats
|
114
160
|
test_files:
|
115
|
-
- spec/doc_ripper/
|
116
|
-
- spec/doc_ripper/
|
117
|
-
- spec/doc_ripper/
|
161
|
+
- spec/doc_ripper/base_spec.rb
|
162
|
+
- spec/doc_ripper/formats/doc_ripper_spec.rb
|
163
|
+
- spec/doc_ripper/formats/sketch_ripper_spec.rb
|
164
|
+
- spec/doc_ripper/formats/text_ripper_spec.rb
|
165
|
+
- spec/fixtures/complex_sketch_text.sketch
|
118
166
|
- spec/fixtures/lorem.doc
|
119
167
|
- spec/fixtures/lorem.docx
|
120
168
|
- spec/fixtures/lorem.pdf
|
121
|
-
- spec/fixtures/lorem.txt
|
122
169
|
- spec/fixtures/missing_file.txt
|
170
|
+
- spec/fixtures/simple_sketch_text.sketch
|
123
171
|
- spec/fixtures/some_missing_path.txt
|
124
172
|
- spec/spec_helper.rb
|
data/spec/fixtures/lorem.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|