generalscraper 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/generalscraper.rb +11 -2
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ef817246382783da5c99aaf13b1d1a03ebc87ce6
4
- data.tar.gz: 9c41b5cf1c937e4ce9826ba7bd619e380277d309
3
+ metadata.gz: 84330f1a3a5c18dd9b2d8b200b141b5ba7c85827
4
+ data.tar.gz: 5fe4f81df62c28962565340858984e3d6b6f8a86
5
5
  SHA512:
6
- metadata.gz: 73967c97043bbd9c79849b036285cf41a187f654a456f53d56b1ecea95042f2db0020eecba0a48366390b6d08f28ae95c8c179815608def58691e38520363a55
7
- data.tar.gz: 2d9a966ecc6a6389a730044fd7c94c6a9588fdaa8297d8bea11ac048c30321c8e9b7c76fca0c4bb56e1e1408669e8a27c628e367c8c236dc06ebb3a85470071f
6
+ metadata.gz: 58b032cb6a3f33f4e5f1d972147952906ca95be1fbbbf181afad95336cc4a24b7585733305e8eefe9bce38da77cb989510a41020b9d418d93b68c7ef2fc8c1e1
7
+ data.tar.gz: 401093d88ca984e4a3fdc5f39f3ec35be9441fdff15e23fd434d623203ec22a9687f98d5fcfdd021c90662eb791f8ec34e57d45d59d08b5364b626a1961d6865
@@ -2,6 +2,7 @@ require 'mechanize'
2
2
  require 'json'
3
3
  require 'nokogiri'
4
4
  require 'open-uri'
5
+ require 'uploadconvert'
5
6
 
6
7
  class GeneralScraper
7
8
  def initialize(scrapesite, input, table)
@@ -64,11 +65,19 @@ class GeneralScraper
64
65
  end
65
66
  end
66
67
  if @table == false
67
- pagehash[:page] = html.css("body").text
68
+ if url.include? ".pdf"
69
+ `wget -P public/uploads #{url}`
70
+ path = url.split("/")
71
+ u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
72
+ pdfparse = JSON.parse(u.handleDoc)
73
+ pdfparse.each{|k, v| pagehash[k] = v}
74
+ else
75
+ pagehash[:text] = html.css("body").text
76
+ end
68
77
  end
69
78
  @output.push(pagehash)
70
79
  rescue
71
- puts "URL: " + url
80
+
72
81
  end
73
82
  end
74
83
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-15 00:00:00.000000000 Z
11
+ date: 2014-05-18 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes all pages on a site you specify including terms you specify.
14
14
  email: shidash@shidash.com