generalscraper 0.0.25 → 0.0.26
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/parse_page.rb +4 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 85a128c0ae855a52ee3842ba40807b7adc84a43a
|
4
|
+
data.tar.gz: c7b8aa3648f3735c0f69ac7f91884bbd04a2450a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 54c19ac3c90c3b99b9be0115945c1f59ad71951285a5d779090f4ec91c3e4fd2fbf995c4393a27cc0d73d0efee515d62182b2851cb6205f7f4dfe2a0f17fb84d
|
7
|
+
data.tar.gz: 0ea35567e7c33f9b46ab678f2bf3231d8a7b79a883e749728de0eaa9d46e772e91cc4346bc4ce3c8ff4f9a4855b0acd06cc338e63920c1228a7e5b2ef5f36837
|
data/lib/parse_page.rb
CHANGED
@@ -31,11 +31,12 @@ module ParsePage
|
|
31
31
|
|
32
32
|
# Download and extract text from PDF
|
33
33
|
def getPDF(url, pagehash)
|
34
|
-
`wget --tries=2 -P public/uploads #{url}`
|
35
34
|
path = url.split("/")
|
36
|
-
|
35
|
+
filename = path[path.length-1].chomp.strip.gsub(" ", "_").gsub("%20", "_")
|
36
|
+
`wget --tries=2 #{url} -O public/uploads/#{filename}`
|
37
|
+
|
37
38
|
# OCR PDF and save fields
|
38
|
-
u = UploadConvert.new("public/uploads/" +
|
39
|
+
u = UploadConvert.new("public/uploads/" + filename)
|
39
40
|
pdfparse = JSON.parse(u.handleDoc)
|
40
41
|
pdfparse.each{|k, v| pagehash[k] = fixEncode(v)}
|
41
42
|
return pagehash
|