generalscraper 0.0.22 → 0.0.23

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/parse_page.rb +12 -11
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 264365c642b1995ce23c7cc565a395fc56fcb9c7
4
- data.tar.gz: 99b7f3830bc03a3d93ae08540ac6aa0192e28199
3
+ metadata.gz: 85b3d91e96159d5f3cd36961664721d9bd5e7313
4
+ data.tar.gz: 7568e30d7343d9be690e48e0369f7cb3db194a81
5
5
  SHA512:
6
- metadata.gz: 97a3705bd544858a13bc89efa05018502e5395a1a0a4c68ed25fd49944d0338a74e621503547f7828f5faabe40888a542ce4b837e3a8c51abce04af88317845d
7
- data.tar.gz: cd9c7d9d4d6673a6f27f3e9dd275f14e478e10694250f5e2c621307f249394d050688d56248c0a726266705c42ee25fd37fba7c70e45e045f1cf3afffdbf1e16
6
+ metadata.gz: b88b2d814a08bc24b68ed337e4e973471a57c02d1fa9323156ccd7a93f5dab754dde734e48326378e203e5c11599ee1e0be789d45eb728016feb194b0949094b
7
+ data.tar.gz: 5f32fd7d6da2aea69a4654a2e3e3662bcbce88412252a4b366048ffe14d02e0d33a7f57b653138b9de9eb1a7f2117cf38023072b8936a9cbd10e0e5ed7f001b7
data/lib/parse_page.rb CHANGED
@@ -3,20 +3,21 @@ require 'uploadconvert'
3
3
  module ParsePage
4
4
  # Get both page metadata and text
5
5
  def getPageData(url)
6
- begin
7
- page = @requests.get_page(url)
8
- html = Nokogiri::HTML(page)
9
- pagehash = getMetadata(url, html)
10
- pagehash = getContent(url, pagehash, html)
11
- return pagehash
12
- rescue
13
- end
6
+ page = @requests.get_page(url)
7
+ html = Nokogiri::HTML(page)
8
+ pagehash = getMetadata(url, html)
9
+ pagehash = getContent(url, pagehash, html)
10
+ return pagehash
14
11
  end
15
12
 
16
13
  # Get the page content by type of page
17
14
  def getContent(url, pagehash, html)
18
15
  if url.include? ".pdf"
19
- return getPDF(url, pagehash)
16
+ begin
17
+ return getPDF(url, pagehash)
18
+ rescue
19
+ return nil
20
+ end
20
21
  else
21
22
  return getHTMLText(url, pagehash, html)
22
23
  end
@@ -30,7 +31,7 @@ module ParsePage
30
31
 
31
32
  # Download and extract text from PDF
32
33
  def getPDF(url, pagehash)
33
- `wget -P public/uploads #{url}`
34
+ `wget --tries=2 -P public/uploads #{url}`
34
35
  path = url.split("/")
35
36
 
36
37
  # OCR PDF and save fields
@@ -51,7 +52,7 @@ module ParsePage
51
52
  pagehash[:date_retrieved] = Time.now
52
53
 
53
54
  # Get title and meta tag info
54
- pagehash[:title] = fixEncode(html.css("title").text)
55
+ pagehash[:page_title] = fixEncode(html.css("title").text)
55
56
  html.css("meta").each do |m|
56
57
  if m
57
58
  pagehash[m['name']] = fixEncode(m['content'])
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.22
4
+ version: 0.0.23
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath