doc_ripper 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 48d68fcd85b60d3ab64df91710c31158a998dc1a
4
- data.tar.gz: 0aba15d63c943c4f33f5dbe14503907a72744a07
3
+ metadata.gz: 42889a5e0fcb743f56d335c897498f1a20f2256e
4
+ data.tar.gz: bf28748767b46ca425aa43a593d05abe01978dc0
5
5
  SHA512:
6
- metadata.gz: 104082f0efdf157273abda2e201b55cb315fe861005e4a14d358a09297a2f2b7b5d98e04fdd371196f85c6325f841ad82932bb33c123cc9432b35e9c5f45d572
7
- data.tar.gz: 2f1b2d0c81f45ed03e197b0a0a2eaa41a057629f8c5c32f2107dc09233e67f12a141e5a19fa53ab8e97dd434e66f7fee18cddf1aa6c81bf600e974c9440383de
6
+ metadata.gz: aa27ab39d9ec18416b81adf7fe2d6d1d4cccd88460a94c557d870a5069bfd999f2c2b937e690be5d8db4ed034804c8b2582bbf299e484f57f992f7cc970312c0
7
+ data.tar.gz: e528e8a0310b2096c6e0ea2e1f2889da2d3513ad3d4b1ab033ad2d89f33ddeb0ff6ea6bee14a148c7cdc32ae993fe6ea89dc0d4e327d81ba6fce9ef11e0d7db4
@@ -3,7 +3,7 @@ module DocRipper
3
3
  class PdfRipper < Ripper::Base
4
4
 
5
5
  def rip
6
- @text ||= %x(pdftotext #{to_shell(file_path)})
6
+ @text ||= %x(pdftotext #{to_shell(file_path)} -)
7
7
  end
8
8
 
9
9
  end
@@ -1,3 +1,3 @@
1
1
  module DocRipper
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
@@ -0,0 +1,17 @@
1
+ require "spec_helper"
2
+
3
+ module DocRipper
4
+ describe "PdfRipper" do
5
+ let(:test_pdf_path) { "#{FIXTURE_PATH}test_pdf.pdf" }
6
+ let(:test_pdf_text) { "A Simple PDF File\nThis is a small demonstration .pdf file just for use in the Virtual Mechanics tutorials. More text. And more\ntext. And more text. And more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. And more text. Boring, zzzzz. And more text. And more text. And\nmore text. And more text. And more text. And more text. And more text.\nAnd more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. And more text. And more text. Even more. Continued on page 2 ...\n\n\fSimple PDF File 2\n...continued from page 1. Yet more text. And more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. Oh, how boring typing this stuff. But not as boring as watching\npaint dry. And more text. And more text. And more text. And more text.\nBoring. More, a little more text. The end, and just as well.\n\n\f" }
7
+
8
+ describe "#rip" do
9
+
10
+ let(:ripper) { DocRipper.rip(test_pdf_path) }
11
+
12
+ it "returns correct text from pdf" do
13
+ expect(ripper.gsub(/\s/, " ")).to eq(test_pdf_text.gsub(/\s/, " "))
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,198 @@
1
+ %PDF-1.3
2
+ %����
3
+
4
+ 1 0 obj
5
+ <<
6
+ /Type /Catalog
7
+ /Outlines 2 0 R
8
+ /Pages 3 0 R
9
+ >>
10
+ endobj
11
+
12
+ 2 0 obj
13
+ <<
14
+ /Type /Outlines
15
+ /Count 0
16
+ >>
17
+ endobj
18
+
19
+ 3 0 obj
20
+ <<
21
+ /Type /Pages
22
+ /Count 2
23
+ /Kids [ 4 0 R 6 0 R ]
24
+ >>
25
+ endobj
26
+
27
+ 4 0 obj
28
+ <<
29
+ /Type /Page
30
+ /Parent 3 0 R
31
+ /Resources <<
32
+ /Font <<
33
+ /F1 9 0 R
34
+ >>
35
+ /ProcSet 8 0 R
36
+ >>
37
+ /MediaBox [0 0 612.0000 792.0000]
38
+ /Contents 5 0 R
39
+ >>
40
+ endobj
41
+
42
+ 5 0 obj
43
+ << /Length 1074 >>
44
+ stream
45
+ 2 J
46
+ BT
47
+ 0 0 0 rg
48
+ /F1 0027 Tf
49
+ 57.3750 722.2800 Td
50
+ ( A Simple PDF File ) Tj
51
+ ET
52
+ BT
53
+ /F1 0010 Tf
54
+ 69.2500 688.6080 Td
55
+ ( This is a small demonstration .pdf file - ) Tj
56
+ ET
57
+ BT
58
+ /F1 0010 Tf
59
+ 69.2500 664.7040 Td
60
+ ( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj
61
+ ET
62
+ BT
63
+ /F1 0010 Tf
64
+ 69.2500 652.7520 Td
65
+ ( text. And more text. And more text. And more text. ) Tj
66
+ ET
67
+ BT
68
+ /F1 0010 Tf
69
+ 69.2500 628.8480 Td
70
+ ( And more text. And more text. And more text. And more text. And more ) Tj
71
+ ET
72
+ BT
73
+ /F1 0010 Tf
74
+ 69.2500 616.8960 Td
75
+ ( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj
76
+ ET
77
+ BT
78
+ /F1 0010 Tf
79
+ 69.2500 604.9440 Td
80
+ ( more text. And more text. And more text. And more text. And more text. ) Tj
81
+ ET
82
+ BT
83
+ /F1 0010 Tf
84
+ 69.2500 592.9920 Td
85
+ ( And more text. And more text. ) Tj
86
+ ET
87
+ BT
88
+ /F1 0010 Tf
89
+ 69.2500 569.0880 Td
90
+ ( And more text. And more text. And more text. And more text. And more ) Tj
91
+ ET
92
+ BT
93
+ /F1 0010 Tf
94
+ 69.2500 557.1360 Td
95
+ ( text. And more text. And more text. Even more. Continued on page 2 ...) Tj
96
+ ET
97
+ endstream
98
+ endobj
99
+
100
+ 6 0 obj
101
+ <<
102
+ /Type /Page
103
+ /Parent 3 0 R
104
+ /Resources <<
105
+ /Font <<
106
+ /F1 9 0 R
107
+ >>
108
+ /ProcSet 8 0 R
109
+ >>
110
+ /MediaBox [0 0 612.0000 792.0000]
111
+ /Contents 7 0 R
112
+ >>
113
+ endobj
114
+
115
+ 7 0 obj
116
+ << /Length 676 >>
117
+ stream
118
+ 2 J
119
+ BT
120
+ 0 0 0 rg
121
+ /F1 0027 Tf
122
+ 57.3750 722.2800 Td
123
+ ( Simple PDF File 2 ) Tj
124
+ ET
125
+ BT
126
+ /F1 0010 Tf
127
+ 69.2500 688.6080 Td
128
+ ( ...continued from page 1. Yet more text. And more text. And more text. ) Tj
129
+ ET
130
+ BT
131
+ /F1 0010 Tf
132
+ 69.2500 676.6560 Td
133
+ ( And more text. And more text. And more text. And more text. And more ) Tj
134
+ ET
135
+ BT
136
+ /F1 0010 Tf
137
+ 69.2500 664.7040 Td
138
+ ( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj
139
+ ET
140
+ BT
141
+ /F1 0010 Tf
142
+ 69.2500 652.7520 Td
143
+ ( paint dry. And more text. And more text. And more text. And more text. ) Tj
144
+ ET
145
+ BT
146
+ /F1 0010 Tf
147
+ 69.2500 640.8000 Td
148
+ ( Boring. More, a little more text. The end, and just as well. ) Tj
149
+ ET
150
+ endstream
151
+ endobj
152
+
153
+ 8 0 obj
154
+ [/PDF /Text]
155
+ endobj
156
+
157
+ 9 0 obj
158
+ <<
159
+ /Type /Font
160
+ /Subtype /Type1
161
+ /Name /F1
162
+ /BaseFont /Helvetica
163
+ /Encoding /WinAnsiEncoding
164
+ >>
165
+ endobj
166
+
167
+ 10 0 obj
168
+ <<
169
+ /Creator (Rave \(http://www.nevrona.com/rave\))
170
+ /Producer (Nevrona Designs)
171
+ /CreationDate (D:20060301072826)
172
+ >>
173
+ endobj
174
+
175
+ xref
176
+ 0 11
177
+ 0000000000 65535 f
178
+ 0000000019 00000 n
179
+ 0000000093 00000 n
180
+ 0000000147 00000 n
181
+ 0000000222 00000 n
182
+ 0000000390 00000 n
183
+ 0000001522 00000 n
184
+ 0000001690 00000 n
185
+ 0000002423 00000 n
186
+ 0000002456 00000 n
187
+ 0000002574 00000 n
188
+
189
+ trailer
190
+ <<
191
+ /Size 11
192
+ /Root 1 0 R
193
+ /Info 10 0 R
194
+ >>
195
+
196
+ startxref
197
+ 2714
198
+ %%EOF
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc_ripper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Zaich
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-11-29 00:00:00.000000000 Z
11
+ date: 2019-02-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: CFPropertyList
@@ -146,12 +146,8 @@ files:
146
146
  - lib/doc_ripper/formats/sketch_ripper.rb
147
147
  - lib/doc_ripper/formats/text_ripper.rb
148
148
  - lib/doc_ripper/version.rb
149
- - pkg/doc_ripper-0.0.5.gem
150
- - pkg/doc_ripper-0.0.6.gem
151
- - pkg/doc_ripper-0.0.7.1.gem
152
- - pkg/doc_ripper-0.0.7.2.gem
153
- - pkg/doc_ripper-0.0.7.gem
154
149
  - spec/doc_ripper/doc_ripper_spec.rb
150
+ - spec/doc_ripper/formats/pdf_ripper_spec.rb
155
151
  - spec/doc_ripper/formats/sketch_ripper_spec.rb
156
152
  - spec/fixtures/chinese.docx
157
153
  - spec/fixtures/complex_sketch_text.sketch
@@ -162,6 +158,7 @@ files:
162
158
  - spec/fixtures/missing_file.txt
163
159
  - spec/fixtures/simple_sketch_text.sketch
164
160
  - spec/fixtures/some_missing_path.txt
161
+ - spec/fixtures/test_pdf.pdf
165
162
  - spec/spec_helper.rb
166
163
  homepage: https://github.com/pzaich/doc_ripper
167
164
  licenses:
@@ -185,12 +182,13 @@ requirements:
185
182
  - Antiword
186
183
  - pdftotext/poppler
187
184
  rubyforge_project:
188
- rubygems_version: 2.6.14
185
+ rubygems_version: 2.6.14.1
189
186
  signing_key:
190
187
  specification_version: 4
191
188
  summary: Rip out text from pdf, doc and docx formats
192
189
  test_files:
193
190
  - spec/doc_ripper/doc_ripper_spec.rb
191
+ - spec/doc_ripper/formats/pdf_ripper_spec.rb
194
192
  - spec/doc_ripper/formats/sketch_ripper_spec.rb
195
193
  - spec/fixtures/chinese.docx
196
194
  - spec/fixtures/complex_sketch_text.sketch
@@ -201,4 +199,5 @@ test_files:
201
199
  - spec/fixtures/missing_file.txt
202
200
  - spec/fixtures/simple_sketch_text.sketch
203
201
  - spec/fixtures/some_missing_path.txt
202
+ - spec/fixtures/test_pdf.pdf
204
203
  - spec/spec_helper.rb
Binary file
Binary file
Binary file
Binary file
Binary file