generalscraper 0.0.4 → 0.0.5
Sign up to get free protection for your applications and access to all the features.
- checksums.yaml +4 -4
- data/lib/parse_page.rb +3 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 21df27ba7416ab3ea410f3c38f0aea43cfa0c5f0
|
4
|
+
data.tar.gz: bbdfaa98d9c7c0600dc626b43da0127a43472b36
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f9c37e1e151b37d4eb231fb22304d9f7868eed8a02cb874aa9968756be0f2ad2f555f36c2e8a02977c353579232c6c7a40ee776236ec425e2bef28959f6ce80a
|
7
|
+
data.tar.gz: 57d4a622ed823a0acad91bea00a787e2f341721ed110c3d5a88b93f9c981e267a14d62a0db1c71993a10dd9042d77496833724a5f893123288b15f9d9faf9223
|
data/lib/parse_page.rb
CHANGED
@@ -23,7 +23,7 @@ module ParsePage
|
|
23
23
|
# Download the page text
|
24
24
|
def getHTMLText(url, pagehash)
|
25
25
|
html = Nokogiri::HTML(getPage(url).body)
|
26
|
-
pagehash[:text] = html.css("body").text
|
26
|
+
pagehash[:text] = html.css("body").text.encode("UTF-8")
|
27
27
|
return pagehash
|
28
28
|
end
|
29
29
|
|
@@ -35,7 +35,7 @@ module ParsePage
|
|
35
35
|
# OCR PDF and save fields
|
36
36
|
u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
|
37
37
|
pdfparse = JSON.parse(u.handleDoc)
|
38
|
-
pdfparse.each{|k, v| pagehash[k] = v}
|
38
|
+
pdfparse.each{|k, v| pagehash[k] = v.encode("UTF-8")}
|
39
39
|
return pagehash
|
40
40
|
end
|
41
41
|
|
@@ -51,7 +51,7 @@ module ParsePage
|
|
51
51
|
|
52
52
|
# Get title and meta tag info
|
53
53
|
html = Nokogiri::HTML(getPage(url).body) # Eventually modify this
|
54
|
-
pagehash[:title] = html.css("title").text
|
54
|
+
pagehash[:title] = html.css("title").text.encode("UTF-8")
|
55
55
|
html.css("meta").each do |m|
|
56
56
|
if m
|
57
57
|
pagehash[m['name']] = m['content']
|