generalscraper 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 21df27ba7416ab3ea410f3c38f0aea43cfa0c5f0
-  data.tar.gz: bbdfaa98d9c7c0600dc626b43da0127a43472b36
+  metadata.gz: dbbd54bb1986056a1e3207eda365f04d4684e0f2
+  data.tar.gz: 274b1a30166371f8f587484020c20a86690e7161
 SHA512:
-  metadata.gz: f9c37e1e151b37d4eb231fb22304d9f7868eed8a02cb874aa9968756be0f2ad2f555f36c2e8a02977c353579232c6c7a40ee776236ec425e2bef28959f6ce80a
-  data.tar.gz: 57d4a622ed823a0acad91bea00a787e2f341721ed110c3d5a88b93f9c981e267a14d62a0db1c71993a10dd9042d77496833724a5f893123288b15f9d9faf9223
+  metadata.gz: 0f99388b754103326c19a436d41dac5740169beb36554c40f6731a6c0bb3c2d0232742d17fab442c41ee34f55e424dfe6c046d0a0b2c7156d53d5a370d1800ac
+  data.tar.gz: d7f7ed11b8d11c4e0a8b85e83a35d86b3729633d4f2335d8dc6e3d206bd9b04eaaf02070458b28f879b8fd5df9620da37c34770606b3d26a58faa385c78f45e9
data/lib/generalscraper.rb CHANGED
@@ -29,14 +29,25 @@ class GeneralScraper
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
     page.links.each do |link|
-      if (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+      if isResultLink?(link)
         siteURLSave(link)
-      elsif (link.href.include? "&sa=N") && (link.href.include? "&start=")
+      elsif isSearchPageLink?(link)
         nextSearchPage(link)
       end
     end
   end
 
+  # Determines if url is link to search result
+  def isResultLink?(link)
+    return (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+  end
+
+  # Determines if URL is link to next search page
+  def isSearchPageLink?(link)
+    return (link.href.include? "&sa=N") && (link.href.include? "&start=")
+  end
+
+
   # Parse and save the URLs for search results
   def siteURLSave(link)
     site_url = link.href.split("?q=")[1]
@@ -53,6 +64,7 @@ class GeneralScraper
     end
   end
 
+
   # Gets all data and returns in JSON
   def getData
     search
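
The generalscraper.rb change above is a pure refactor: the two inline conditions in categorizeLinks are extracted into the isResultLink? and isSearchPageLink? predicates without altering their logic. Below is a rough, self-contained sketch of how those checks classify links; FakeLink stands in for a Mechanize page link, and the op_val/operators values are illustrative stand-ins for the gem's @op_val/@operators instance variables.

    # Illustrative only - mirrors the extracted predicates, not gem code.
    FakeLink = Struct.new(:href)

    op_val    = "example.com"         # assumed target of the site: operator
    operators = "site: example.com"   # assumed raw operator string

    result_link = FakeLink.new("/url?q=http://example.com/page&sa=U")
    next_page   = FakeLink.new("/search?q=site:example.com&start=10&sa=N")

    is_result = (result_link.href.include? op_val) &&
                (!result_link.href.include? "webcache") &&
                (!result_link.href.include? operators.gsub(" ", "+"))
    is_next_page = (next_page.href.include? "&sa=N") && (next_page.href.include? "&start=")

    puts is_result     # => true
    puts is_next_page  # => true
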
data/lib/parse_page.rb CHANGED
@@ -4,26 +4,26 @@ module ParsePage
   # Get both page metadata and text
   def getPageData(url)
     begin
-      pagehash = getMetadata(url)
-      pagehash = getContent(url, pagehash)
+      html = Nokogiri::HTML(getPage(url).body)
+      pagehash = getMetadata(url, html)
+      pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
     rescue
     end
   end
 
   # Get the page content by type of page
-  def getContent(url, pagehash)
+  def getContent(url, pagehash, html)
     if url.include? ".pdf"
       return getPDF(url, pagehash)
     else
-      return getHTMLText(url, pagehash)
+      return getHTMLText(url, pagehash, html)
     end
   end
 
   # Download the page text
-  def getHTMLText(url, pagehash)
-    html = Nokogiri::HTML(getPage(url).body)
-    pagehash[:text] = html.css("body").text.encode("UTF-8")
+  def getHTMLText(url, pagehash, html)
+    pagehash[:text] = fixEncode(html.css("body").text)
     return pagehash
   end
 
@@ -35,12 +35,12 @@ module ParsePage
     # OCR PDF and save fields
     u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
     pdfparse = JSON.parse(u.handleDoc)
-    pdfparse.each{|k, v| pagehash[k] = v.encode("UTF-8")}
+    pdfparse.each{|k, v| pagehash[k] = fixEncode(v)}
     return pagehash
   end
 
   # Get the page metadata
-  def getMetadata(url)
+  def getMetadata(url, html)
     pagehash = Hash.new
 
     # Save URL and date retreived
@@ -50,14 +50,21 @@ module ParsePage
     pagehash[:date_retrieved] = Time.now
 
     # Get title and meta tag info
-    html = Nokogiri::HTML(getPage(url).body) # Eventually modify this
-    pagehash[:title] = html.css("title").text.encode("UTF-8")
+    pagehash[:title] = fixEncode(html.css("title").text)
     html.css("meta").each do |m|
       if m
-        pagehash[m['name']] = fixEncode(m['content'])
+        pagehash[m['name']] = fixEncode(m['content'])
       end
     end
 
     return pagehash
   end
+
+  def fixEncode(str)
+    if str.is_a?(String)
+      return str.unpack('C*').pack('U*')
+    else
+      return str
+    end
+  end
 end
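
The parse_page.rb change parses each page once with Nokogiri and threads the resulting document through getMetadata and getContent, and it replaces the .encode("UTF-8") calls with the new fixEncode helper. The following is a minimal sketch of what that helper's byte-level re-encoding does; the fix_encode name and sample string are illustrative, not part of the gem.

    # Illustrative only: unpack('C*') reads the string as raw byte values and
    # pack('U*') re-emits each value as a UTF-8 codepoint, i.e. it treats the
    # input as ISO-8859-1 (Latin-1) and transcodes it to UTF-8.
    def fix_encode(str)
      str.is_a?(String) ? str.unpack('C*').pack('U*') : str
    end

    latin1 = "caf\xE9".force_encoding("ASCII-8BIT")   # 0xE9 is "é" in Latin-1
    utf8   = fix_encode(latin1)
    puts utf8           # => café
    puts utf8.encoding  # => UTF-8

This sidesteps the conversion errors that String#encode can raise on oddly encoded pages, though input that already contains multibyte UTF-8 characters would come out garbled.
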
data/lib/proxy_manager.rb CHANGED
@@ -1,35 +1,53 @@
 require 'active_support/time'
 require 'mechanize'
+require 'uri'
 
 module ProxyManager
   # Get the page with a proxy
-  def getPage(url, form_input = nil)
+  def getPage(url, form_input = nil, fail_count = 0)
     agent = Mechanize.new do |a|
       a.user_agent_alias = "Linux Firefox"
-      a.set_proxy(getRandomProxy, 80)
+      a.set_proxy(*getRandomProxy(url))
     end
 
-    if form_input
-      gform = agent.get(url).form("f")
-      gform.q = form_input
-      return agent.submit(gform, gform.buttons.first)
-    else
-      return agent.get(url)
+    # Slightly different based on filling in form or not
+    begin
+      if form_input
+        gform = agent.get(url).form("f")
+        gform.q = form_input
+        return agent.submit(gform, gform.buttons.first)
+      else
+        return agent.get(url)
+      end
+    rescue # Only retry request 5 times
+      getPage(url, form_input, fail_count+=1) if fail_count < 5
     end
   end
 
   # Choose a random proxy
-  def getRandomProxy
+  def getRandomProxy(url)
     max = @proxylist.length
     chosen = @proxylist[Random.rand(max)]
 
-    # Only use proxy if it hasn't been used in last 20 seconds
-    if !@usedproxies[chosen] || @usedproxies[chosen] < Time.now-20
-      @usedproxies[chosen] = Time.now
-      return chosen
+    # Only use proxy if it hasn't been used in last 20 seconds on same host
+    if isNotUsed?(chosen, url)
+      @usedproxies[chosen] = [Time.now, URI.parse(url).host]
+      return parseProxy(chosen)
     else
-      sleep(0.5)
-      getRandomProxy
+      sleep(0.005)
+      getRandomProxy(url)
     end
   end
+
+  # Splits up proxy into IP, port, user, password
+  def parseProxy(chosen)
+    proxy_info = chosen.split(":")
+    proxy_info[proxy_info.length-1] = proxy_info.last.strip
+    return proxy_info
+  end
+
+  # Checks if a proxy has been used on domain in the last 20 seconds
+  def isNotUsed?(chosen, url)
+    return !@usedproxies[chosen] || @usedproxies[chosen][0] <= Time.now-20 || @usedproxies[chosen][1] != URI.parse(url).host
+  end
 end
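
The proxy_manager.rb change adds retry logic for failed requests, supports proxies with credentials, and rotates proxies per host: getRandomProxy(url) now returns an array that is splatted into Mechanize's set_proxy, and @usedproxies records when and on which host each proxy was last used. Below is a rough sketch of that bookkeeping, assuming @proxylist holds newline-terminated "host:port:user:password" entries as parseProxy implies; the addresses are placeholders.

    # Illustrative only - not gem code.
    require 'uri'

    proxylist   = ["203.0.113.7:8080:user:secret\n", "198.51.100.2:3128:user:secret\n"]
    usedproxies = {}

    chosen = proxylist.sample
    info   = chosen.split(":")
    info[info.length - 1] = info.last.strip   # drop the trailing newline, as parseProxy does
    usedproxies[chosen] = [Time.now, URI.parse("http://example.com/results").host]

    p info  # e.g. ["203.0.113.7", "8080", "user", "secret"]
    # agent.set_proxy(*info) then receives address, port, user, and password in order.
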
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-07 00:00:00.000000000 Z
+date: 2014-04-10 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com