generalscraper 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/generalscraper.rb +11 -2
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ef817246382783da5c99aaf13b1d1a03ebc87ce6
4
- data.tar.gz: 9c41b5cf1c937e4ce9826ba7bd619e380277d309
3
+ metadata.gz: 84330f1a3a5c18dd9b2d8b200b141b5ba7c85827
4
+ data.tar.gz: 5fe4f81df62c28962565340858984e3d6b6f8a86
5
5
  SHA512:
6
- metadata.gz: 73967c97043bbd9c79849b036285cf41a187f654a456f53d56b1ecea95042f2db0020eecba0a48366390b6d08f28ae95c8c179815608def58691e38520363a55
7
- data.tar.gz: 2d9a966ecc6a6389a730044fd7c94c6a9588fdaa8297d8bea11ac048c30321c8e9b7c76fca0c4bb56e1e1408669e8a27c628e367c8c236dc06ebb3a85470071f
6
+ metadata.gz: 58b032cb6a3f33f4e5f1d972147952906ca95be1fbbbf181afad95336cc4a24b7585733305e8eefe9bce38da77cb989510a41020b9d418d93b68c7ef2fc8c1e1
7
+ data.tar.gz: 401093d88ca984e4a3fdc5f39f3ec35be9441fdff15e23fd434d623203ec22a9687f98d5fcfdd021c90662eb791f8ec34e57d45d59d08b5364b626a1961d6865
data/lib/generalscraper.rb CHANGED
@@ -2,6 +2,7 @@ require 'mechanize'
2
2
  require 'json'
3
3
  require 'nokogiri'
4
4
  require 'open-uri'
5
+ require 'uploadconvert'
5
6
 
6
7
  class GeneralScraper
7
8
  def initialize(scrapesite, input, table)
@@ -64,11 +65,19 @@ class GeneralScraper
64
65
  end
65
66
  end
66
67
  if @table == false
67
- pagehash[:page] = html.css("body").text
68
+ if url.include? ".pdf"
69
+ `wget -P public/uploads #{url}`
70
+ path = url.split("/")
71
+ u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
72
+ pdfparse = JSON.parse(u.handleDoc)
73
+ pdfparse.each{|k, v| pagehash[k] = v}
74
+ else
75
+ pagehash[:text] = html.css("body").text
76
+ end
68
77
  end
69
78
  @output.push(pagehash)
70
79
  rescue
71
- puts "URL: " + url
80
+
72
81
  end
73
82
  end
74
83
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-15 00:00:00.000000000 Z
11
+ date: 2014-05-18 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes all pages on a site you specify including terms you specify.
14
14
  email: shidash@shidash.com