generalscraper 0.0.22 → 0.0.23
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/parse_page.rb +12 -11
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 85b3d91e96159d5f3cd36961664721d9bd5e7313
|
4
|
+
data.tar.gz: 7568e30d7343d9be690e48e0369f7cb3db194a81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b88b2d814a08bc24b68ed337e4e973471a57c02d1fa9323156ccd7a93f5dab754dde734e48326378e203e5c11599ee1e0be789d45eb728016feb194b0949094b
|
7
|
+
data.tar.gz: 5f32fd7d6da2aea69a4654a2e3e3662bcbce88412252a4b366048ffe14d02e0d33a7f57b653138b9de9eb1a7f2117cf38023072b8936a9cbd10e0e5ed7f001b7
|
data/lib/parse_page.rb
CHANGED
@@ -3,20 +3,21 @@ require 'uploadconvert'
|
|
3
3
|
module ParsePage
|
4
4
|
# Get both page metadata and text
|
5
5
|
def getPageData(url)
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
return pagehash
|
12
|
-
rescue
|
13
|
-
end
|
6
|
+
page = @requests.get_page(url)
|
7
|
+
html = Nokogiri::HTML(page)
|
8
|
+
pagehash = getMetadata(url, html)
|
9
|
+
pagehash = getContent(url, pagehash, html)
|
10
|
+
return pagehash
|
14
11
|
end
|
15
12
|
|
16
13
|
# Get the page content by type of page
|
17
14
|
def getContent(url, pagehash, html)
|
18
15
|
if url.include? ".pdf"
|
19
|
-
|
16
|
+
begin
|
17
|
+
return getPDF(url, pagehash)
|
18
|
+
rescue
|
19
|
+
return nil
|
20
|
+
end
|
20
21
|
else
|
21
22
|
return getHTMLText(url, pagehash, html)
|
22
23
|
end
|
@@ -30,7 +31,7 @@ module ParsePage
|
|
30
31
|
|
31
32
|
# Download and extract text from PDF
|
32
33
|
def getPDF(url, pagehash)
|
33
|
-
`wget -P public/uploads #{url}`
|
34
|
+
`wget --tries=2 -P public/uploads #{url}`
|
34
35
|
path = url.split("/")
|
35
36
|
|
36
37
|
# OCR PDF and save fields
|
@@ -51,7 +52,7 @@ module ParsePage
|
|
51
52
|
pagehash[:date_retrieved] = Time.now
|
52
53
|
|
53
54
|
# Get title and meta tag info
|
54
|
-
pagehash[:
|
55
|
+
pagehash[:page_title] = fixEncode(html.css("title").text)
|
55
56
|
html.css("meta").each do |m|
|
56
57
|
if m
|
57
58
|
pagehash[m['name']] = fixEncode(m['content'])
|