generalscraper 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +11 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84330f1a3a5c18dd9b2d8b200b141b5ba7c85827
|
4
|
+
data.tar.gz: 5fe4f81df62c28962565340858984e3d6b6f8a86
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 58b032cb6a3f33f4e5f1d972147952906ca95be1fbbbf181afad95336cc4a24b7585733305e8eefe9bce38da77cb989510a41020b9d418d93b68c7ef2fc8c1e1
|
7
|
+
data.tar.gz: 401093d88ca984e4a3fdc5f39f3ec35be9441fdff15e23fd434d623203ec22a9687f98d5fcfdd021c90662eb791f8ec34e57d45d59d08b5364b626a1961d6865
|
data/lib/generalscraper.rb
CHANGED
@@ -2,6 +2,7 @@ require 'mechanize'
|
|
2
2
|
require 'json'
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'open-uri'
|
5
|
+
require 'uploadconvert'
|
5
6
|
|
6
7
|
class GeneralScraper
|
7
8
|
def initialize(scrapesite, input, table)
|
@@ -64,11 +65,19 @@ class GeneralScraper
|
|
64
65
|
end
|
65
66
|
end
|
66
67
|
if @table == false
|
67
|
-
|
68
|
+
if url.include? ".pdf"
|
69
|
+
`wget -P public/uploads #{url}`
|
70
|
+
path = url.split("/")
|
71
|
+
u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
|
72
|
+
pdfparse = JSON.parse(u.handleDoc)
|
73
|
+
pdfparse.each{|k, v| pagehash[k] = v}
|
74
|
+
else
|
75
|
+
pagehash[:text] = html.css("body").text
|
76
|
+
end
|
68
77
|
end
|
69
78
|
@output.push(pagehash)
|
70
79
|
rescue
|
71
|
-
|
80
|
+
|
72
81
|
end
|
73
82
|
end
|
74
83
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-18 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes all pages on a site you specify including terms you specify.
|
14
14
|
email: shidash@shidash.com
|