generalscraper 0.0.22 → 0.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/parse_page.rb +12 -11
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 264365c642b1995ce23c7cc565a395fc56fcb9c7
4
- data.tar.gz: 99b7f3830bc03a3d93ae08540ac6aa0192e28199
3
+ metadata.gz: 85b3d91e96159d5f3cd36961664721d9bd5e7313
4
+ data.tar.gz: 7568e30d7343d9be690e48e0369f7cb3db194a81
5
5
  SHA512:
6
- metadata.gz: 97a3705bd544858a13bc89efa05018502e5395a1a0a4c68ed25fd49944d0338a74e621503547f7828f5faabe40888a542ce4b837e3a8c51abce04af88317845d
7
- data.tar.gz: cd9c7d9d4d6673a6f27f3e9dd275f14e478e10694250f5e2c621307f249394d050688d56248c0a726266705c42ee25fd37fba7c70e45e045f1cf3afffdbf1e16
6
+ metadata.gz: b88b2d814a08bc24b68ed337e4e973471a57c02d1fa9323156ccd7a93f5dab754dde734e48326378e203e5c11599ee1e0be789d45eb728016feb194b0949094b
7
+ data.tar.gz: 5f32fd7d6da2aea69a4654a2e3e3662bcbce88412252a4b366048ffe14d02e0d33a7f57b653138b9de9eb1a7f2117cf38023072b8936a9cbd10e0e5ed7f001b7
data/lib/parse_page.rb CHANGED
@@ -3,20 +3,21 @@ require 'uploadconvert'
3
3
  module ParsePage
4
4
  # Get both page metadata and text
5
5
  def getPageData(url)
6
- begin
7
- page = @requests.get_page(url)
8
- html = Nokogiri::HTML(page)
9
- pagehash = getMetadata(url, html)
10
- pagehash = getContent(url, pagehash, html)
11
- return pagehash
12
- rescue
13
- end
6
+ page = @requests.get_page(url)
7
+ html = Nokogiri::HTML(page)
8
+ pagehash = getMetadata(url, html)
9
+ pagehash = getContent(url, pagehash, html)
10
+ return pagehash
14
11
  end
15
12
 
16
13
  # Get the page content by type of page
17
14
  def getContent(url, pagehash, html)
18
15
  if url.include? ".pdf"
19
- return getPDF(url, pagehash)
16
+ begin
17
+ return getPDF(url, pagehash)
18
+ rescue
19
+ return nil
20
+ end
20
21
  else
21
22
  return getHTMLText(url, pagehash, html)
22
23
  end
@@ -30,7 +31,7 @@ module ParsePage
30
31
 
31
32
  # Download and extract text from PDF
32
33
  def getPDF(url, pagehash)
33
- `wget -P public/uploads #{url}`
34
+ `wget --tries=2 -P public/uploads #{url}`
34
35
  path = url.split("/")
35
36
 
36
37
  # OCR PDF and save fields
@@ -51,7 +52,7 @@ module ParsePage
51
52
  pagehash[:date_retrieved] = Time.now
52
53
 
53
54
  # Get title and meta tag info
54
- pagehash[:title] = fixEncode(html.css("title").text)
55
+ pagehash[:page_title] = fixEncode(html.css("title").text)
55
56
  html.css("meta").each do |m|
56
57
  if m
57
58
  pagehash[m['name']] = fixEncode(m['content'])
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.22
4
+ version: 0.0.23
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath