doc_ripper 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/doc_ripper/formats/pdf_ripper.rb +1 -1
- data/lib/doc_ripper/version.rb +1 -1
- data/spec/doc_ripper/formats/pdf_ripper_spec.rb +17 -0
- data/spec/fixtures/test_pdf.pdf +198 -0
- metadata +7 -8
- data/pkg/doc_ripper-0.0.5.gem +0 -0
- data/pkg/doc_ripper-0.0.6.gem +0 -0
- data/pkg/doc_ripper-0.0.7.1.gem +0 -0
- data/pkg/doc_ripper-0.0.7.2.gem +0 -0
- data/pkg/doc_ripper-0.0.7.gem +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 42889a5e0fcb743f56d335c897498f1a20f2256e
|
4
|
+
data.tar.gz: bf28748767b46ca425aa43a593d05abe01978dc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aa27ab39d9ec18416b81adf7fe2d6d1d4cccd88460a94c557d870a5069bfd999f2c2b937e690be5d8db4ed034804c8b2582bbf299e484f57f992f7cc970312c0
|
7
|
+
data.tar.gz: e528e8a0310b2096c6e0ea2e1f2889da2d3513ad3d4b1ab033ad2d89f33ddeb0ff6ea6bee14a148c7cdc32ae993fe6ea89dc0d4e327d81ba6fce9ef11e0d7db4
|
data/lib/doc_ripper/version.rb
CHANGED
data/spec/doc_ripper/formats/pdf_ripper_spec.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module DocRipper
|
4
|
+
describe "PdfRipper" do
|
5
|
+
let(:test_pdf_path) { "#{FIXTURE_PATH}test_pdf.pdf" }
|
6
|
+
let(:test_pdf_text) { "A Simple PDF File\nThis is a small demonstration .pdf file just for use in the Virtual Mechanics tutorials. More text. And more\ntext. And more text. And more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. And more text. Boring, zzzzz. And more text. And more text. And\nmore text. And more text. And more text. And more text. And more text.\nAnd more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. And more text. And more text. Even more. Continued on page 2 ...\n\n\fSimple PDF File 2\n...continued from page 1. Yet more text. And more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. Oh, how boring typing this stuff. But not as boring as watching\npaint dry. And more text. And more text. And more text. And more text.\nBoring. More, a little more text. The end, and just as well.\n\n\f" }
|
7
|
+
|
8
|
+
describe "#rip" do
|
9
|
+
|
10
|
+
let(:ripper) { DocRipper.rip(test_pdf_path) }
|
11
|
+
|
12
|
+
it "returns correct text from pdf" do
|
13
|
+
expect(ripper.gsub(/\s/, " ")).to eq(test_pdf_text.gsub(/\s/, " "))
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/spec/fixtures/test_pdf.pdf
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
%PDF-1.3
|
2
|
+
%����
|
3
|
+
|
4
|
+
1 0 obj
|
5
|
+
<<
|
6
|
+
/Type /Catalog
|
7
|
+
/Outlines 2 0 R
|
8
|
+
/Pages 3 0 R
|
9
|
+
>>
|
10
|
+
endobj
|
11
|
+
|
12
|
+
2 0 obj
|
13
|
+
<<
|
14
|
+
/Type /Outlines
|
15
|
+
/Count 0
|
16
|
+
>>
|
17
|
+
endobj
|
18
|
+
|
19
|
+
3 0 obj
|
20
|
+
<<
|
21
|
+
/Type /Pages
|
22
|
+
/Count 2
|
23
|
+
/Kids [ 4 0 R 6 0 R ]
|
24
|
+
>>
|
25
|
+
endobj
|
26
|
+
|
27
|
+
4 0 obj
|
28
|
+
<<
|
29
|
+
/Type /Page
|
30
|
+
/Parent 3 0 R
|
31
|
+
/Resources <<
|
32
|
+
/Font <<
|
33
|
+
/F1 9 0 R
|
34
|
+
>>
|
35
|
+
/ProcSet 8 0 R
|
36
|
+
>>
|
37
|
+
/MediaBox [0 0 612.0000 792.0000]
|
38
|
+
/Contents 5 0 R
|
39
|
+
>>
|
40
|
+
endobj
|
41
|
+
|
42
|
+
5 0 obj
|
43
|
+
<< /Length 1074 >>
|
44
|
+
stream
|
45
|
+
2 J
|
46
|
+
BT
|
47
|
+
0 0 0 rg
|
48
|
+
/F1 0027 Tf
|
49
|
+
57.3750 722.2800 Td
|
50
|
+
( A Simple PDF File ) Tj
|
51
|
+
ET
|
52
|
+
BT
|
53
|
+
/F1 0010 Tf
|
54
|
+
69.2500 688.6080 Td
|
55
|
+
( This is a small demonstration .pdf file - ) Tj
|
56
|
+
ET
|
57
|
+
BT
|
58
|
+
/F1 0010 Tf
|
59
|
+
69.2500 664.7040 Td
|
60
|
+
( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj
|
61
|
+
ET
|
62
|
+
BT
|
63
|
+
/F1 0010 Tf
|
64
|
+
69.2500 652.7520 Td
|
65
|
+
( text. And more text. And more text. And more text. ) Tj
|
66
|
+
ET
|
67
|
+
BT
|
68
|
+
/F1 0010 Tf
|
69
|
+
69.2500 628.8480 Td
|
70
|
+
( And more text. And more text. And more text. And more text. And more ) Tj
|
71
|
+
ET
|
72
|
+
BT
|
73
|
+
/F1 0010 Tf
|
74
|
+
69.2500 616.8960 Td
|
75
|
+
( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj
|
76
|
+
ET
|
77
|
+
BT
|
78
|
+
/F1 0010 Tf
|
79
|
+
69.2500 604.9440 Td
|
80
|
+
( more text. And more text. And more text. And more text. And more text. ) Tj
|
81
|
+
ET
|
82
|
+
BT
|
83
|
+
/F1 0010 Tf
|
84
|
+
69.2500 592.9920 Td
|
85
|
+
( And more text. And more text. ) Tj
|
86
|
+
ET
|
87
|
+
BT
|
88
|
+
/F1 0010 Tf
|
89
|
+
69.2500 569.0880 Td
|
90
|
+
( And more text. And more text. And more text. And more text. And more ) Tj
|
91
|
+
ET
|
92
|
+
BT
|
93
|
+
/F1 0010 Tf
|
94
|
+
69.2500 557.1360 Td
|
95
|
+
( text. And more text. And more text. Even more. Continued on page 2 ...) Tj
|
96
|
+
ET
|
97
|
+
endstream
|
98
|
+
endobj
|
99
|
+
|
100
|
+
6 0 obj
|
101
|
+
<<
|
102
|
+
/Type /Page
|
103
|
+
/Parent 3 0 R
|
104
|
+
/Resources <<
|
105
|
+
/Font <<
|
106
|
+
/F1 9 0 R
|
107
|
+
>>
|
108
|
+
/ProcSet 8 0 R
|
109
|
+
>>
|
110
|
+
/MediaBox [0 0 612.0000 792.0000]
|
111
|
+
/Contents 7 0 R
|
112
|
+
>>
|
113
|
+
endobj
|
114
|
+
|
115
|
+
7 0 obj
|
116
|
+
<< /Length 676 >>
|
117
|
+
stream
|
118
|
+
2 J
|
119
|
+
BT
|
120
|
+
0 0 0 rg
|
121
|
+
/F1 0027 Tf
|
122
|
+
57.3750 722.2800 Td
|
123
|
+
( Simple PDF File 2 ) Tj
|
124
|
+
ET
|
125
|
+
BT
|
126
|
+
/F1 0010 Tf
|
127
|
+
69.2500 688.6080 Td
|
128
|
+
( ...continued from page 1. Yet more text. And more text. And more text. ) Tj
|
129
|
+
ET
|
130
|
+
BT
|
131
|
+
/F1 0010 Tf
|
132
|
+
69.2500 676.6560 Td
|
133
|
+
( And more text. And more text. And more text. And more text. And more ) Tj
|
134
|
+
ET
|
135
|
+
BT
|
136
|
+
/F1 0010 Tf
|
137
|
+
69.2500 664.7040 Td
|
138
|
+
( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj
|
139
|
+
ET
|
140
|
+
BT
|
141
|
+
/F1 0010 Tf
|
142
|
+
69.2500 652.7520 Td
|
143
|
+
( paint dry. And more text. And more text. And more text. And more text. ) Tj
|
144
|
+
ET
|
145
|
+
BT
|
146
|
+
/F1 0010 Tf
|
147
|
+
69.2500 640.8000 Td
|
148
|
+
( Boring. More, a little more text. The end, and just as well. ) Tj
|
149
|
+
ET
|
150
|
+
endstream
|
151
|
+
endobj
|
152
|
+
|
153
|
+
8 0 obj
|
154
|
+
[/PDF /Text]
|
155
|
+
endobj
|
156
|
+
|
157
|
+
9 0 obj
|
158
|
+
<<
|
159
|
+
/Type /Font
|
160
|
+
/Subtype /Type1
|
161
|
+
/Name /F1
|
162
|
+
/BaseFont /Helvetica
|
163
|
+
/Encoding /WinAnsiEncoding
|
164
|
+
>>
|
165
|
+
endobj
|
166
|
+
|
167
|
+
10 0 obj
|
168
|
+
<<
|
169
|
+
/Creator (Rave \(http://www.nevrona.com/rave\))
|
170
|
+
/Producer (Nevrona Designs)
|
171
|
+
/CreationDate (D:20060301072826)
|
172
|
+
>>
|
173
|
+
endobj
|
174
|
+
|
175
|
+
xref
|
176
|
+
0 11
|
177
|
+
0000000000 65535 f
|
178
|
+
0000000019 00000 n
|
179
|
+
0000000093 00000 n
|
180
|
+
0000000147 00000 n
|
181
|
+
0000000222 00000 n
|
182
|
+
0000000390 00000 n
|
183
|
+
0000001522 00000 n
|
184
|
+
0000001690 00000 n
|
185
|
+
0000002423 00000 n
|
186
|
+
0000002456 00000 n
|
187
|
+
0000002574 00000 n
|
188
|
+
|
189
|
+
trailer
|
190
|
+
<<
|
191
|
+
/Size 11
|
192
|
+
/Root 1 0 R
|
193
|
+
/Info 10 0 R
|
194
|
+
>>
|
195
|
+
|
196
|
+
startxref
|
197
|
+
2714
|
198
|
+
%%EOF
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc_ripper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Zaich
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: CFPropertyList
|
@@ -146,12 +146,8 @@ files:
|
|
146
146
|
- lib/doc_ripper/formats/sketch_ripper.rb
|
147
147
|
- lib/doc_ripper/formats/text_ripper.rb
|
148
148
|
- lib/doc_ripper/version.rb
|
149
|
-
- pkg/doc_ripper-0.0.5.gem
|
150
|
-
- pkg/doc_ripper-0.0.6.gem
|
151
|
-
- pkg/doc_ripper-0.0.7.1.gem
|
152
|
-
- pkg/doc_ripper-0.0.7.2.gem
|
153
|
-
- pkg/doc_ripper-0.0.7.gem
|
154
149
|
- spec/doc_ripper/doc_ripper_spec.rb
|
150
|
+
- spec/doc_ripper/formats/pdf_ripper_spec.rb
|
155
151
|
- spec/doc_ripper/formats/sketch_ripper_spec.rb
|
156
152
|
- spec/fixtures/chinese.docx
|
157
153
|
- spec/fixtures/complex_sketch_text.sketch
|
@@ -162,6 +158,7 @@ files:
|
|
162
158
|
- spec/fixtures/missing_file.txt
|
163
159
|
- spec/fixtures/simple_sketch_text.sketch
|
164
160
|
- spec/fixtures/some_missing_path.txt
|
161
|
+
- spec/fixtures/test_pdf.pdf
|
165
162
|
- spec/spec_helper.rb
|
166
163
|
homepage: https://github.com/pzaich/doc_ripper
|
167
164
|
licenses:
|
@@ -185,12 +182,13 @@ requirements:
|
|
185
182
|
- Antiword
|
186
183
|
- pdftotext/poppler
|
187
184
|
rubyforge_project:
|
188
|
-
rubygems_version: 2.6.14
|
185
|
+
rubygems_version: 2.6.14.1
|
189
186
|
signing_key:
|
190
187
|
specification_version: 4
|
191
188
|
summary: Rip out text from pdf, doc and docx formats
|
192
189
|
test_files:
|
193
190
|
- spec/doc_ripper/doc_ripper_spec.rb
|
191
|
+
- spec/doc_ripper/formats/pdf_ripper_spec.rb
|
194
192
|
- spec/doc_ripper/formats/sketch_ripper_spec.rb
|
195
193
|
- spec/fixtures/chinese.docx
|
196
194
|
- spec/fixtures/complex_sketch_text.sketch
|
@@ -201,4 +199,5 @@ test_files:
|
|
201
199
|
- spec/fixtures/missing_file.txt
|
202
200
|
- spec/fixtures/simple_sketch_text.sketch
|
203
201
|
- spec/fixtures/some_missing_path.txt
|
202
|
+
- spec/fixtures/test_pdf.pdf
|
204
203
|
- spec/spec_helper.rb
|
data/pkg/doc_ripper-0.0.5.gem
DELETED
Binary file
|
data/pkg/doc_ripper-0.0.6.gem
DELETED
Binary file
|
data/pkg/doc_ripper-0.0.7.1.gem
DELETED
Binary file
|
data/pkg/doc_ripper-0.0.7.2.gem
DELETED
Binary file
|
data/pkg/doc_ripper-0.0.7.gem
DELETED
Binary file
|