generalscraper 0.0.5 → 0.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 21df27ba7416ab3ea410f3c38f0aea43cfa0c5f0
-  data.tar.gz: bbdfaa98d9c7c0600dc626b43da0127a43472b36
+  metadata.gz: dbbd54bb1986056a1e3207eda365f04d4684e0f2
+  data.tar.gz: 274b1a30166371f8f587484020c20a86690e7161
 SHA512:
-  metadata.gz: f9c37e1e151b37d4eb231fb22304d9f7868eed8a02cb874aa9968756be0f2ad2f555f36c2e8a02977c353579232c6c7a40ee776236ec425e2bef28959f6ce80a
-  data.tar.gz: 57d4a622ed823a0acad91bea00a787e2f341721ed110c3d5a88b93f9c981e267a14d62a0db1c71993a10dd9042d77496833724a5f893123288b15f9d9faf9223
+  metadata.gz: 0f99388b754103326c19a436d41dac5740169beb36554c40f6731a6c0bb3c2d0232742d17fab442c41ee34f55e424dfe6c046d0a0b2c7156d53d5a370d1800ac
+  data.tar.gz: d7f7ed11b8d11c4e0a8b85e83a35d86b3729633d4f2335d8dc6e3d206bd9b04eaaf02070458b28f879b8fd5df9620da37c34770606b3d26a58faa385c78f45e9
@@ -29,14 +29,25 @@ class GeneralScraper
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
     page.links.each do |link|
-      if (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+      if isResultLink?(link)
        siteURLSave(link)
-      elsif (link.href.include? "&sa=N") && (link.href.include? "&start=")
+      elsif isSearchPageLink?(link)
        nextSearchPage(link)
      end
    end
  end
 
+  # Determines if url is link to search result
+  def isResultLink?(link)
+    return (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+  end
+
+  # Determines if URL is link to next search page
+  def isSearchPageLink?(link)
+    return (link.href.include? "&sa=N") && (link.href.include? "&start=")
+  end
+
+
   # Parse and save the URLs for search results
   def siteURLSave(link)
     site_url = link.href.split("?q=")[1]
@@ -53,6 +64,7 @@ class GeneralScraper
     end
   end
 
+
   # Gets all data and returns in JSON
   def getData
     search
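Note: the two new predicate methods only reorganize the existing href checks. As an illustrative sketch (not part of the diff; the OpenStruct link and the @op_val/@operators sample values below are assumptions), the same checks can be exercised standalone:

    require 'ostruct'

    # Hypothetical stand-in for a Mechanize link; only #href matters here.
    link = OpenStruct.new(href: "/url?q=https://example.com/report&sa=U")

    op_val    = "example.com"        # assumed value of @op_val
    operators = "site:example.com"   # assumed value of @operators

    # Mirrors isResultLink?: keep hrefs containing the operator value,
    # excluding Google cache links and echoes of the query itself.
    is_result = link.href.include?(op_val) &&
                !link.href.include?("webcache") &&
                !link.href.include?(operators.gsub(" ", "+"))

    # Mirrors isSearchPageLink?: pagination links carry both &sa=N and &start=.
    is_next_page = link.href.include?("&sa=N") && link.href.include?("&start=")

    puts is_result      # => true
    puts is_next_page   # => false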
data/lib/parse_page.rb CHANGED
@@ -4,26 +4,26 @@ module ParsePage
   # Get both page metadata and text
   def getPageData(url)
     begin
-      pagehash = getMetadata(url)
-      pagehash = getContent(url, pagehash)
+      html = Nokogiri::HTML(getPage(url).body)
+      pagehash = getMetadata(url, html)
+      pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
     rescue
     end
   end
 
   # Get the page content by type of page
-  def getContent(url, pagehash)
+  def getContent(url, pagehash, html)
     if url.include? ".pdf"
       return getPDF(url, pagehash)
     else
-      return getHTMLText(url, pagehash)
+      return getHTMLText(url, pagehash, html)
     end
   end
 
   # Download the page text
-  def getHTMLText(url, pagehash)
-    html = Nokogiri::HTML(getPage(url).body)
-    pagehash[:text] = html.css("body").text.encode("UTF-8")
+  def getHTMLText(url, pagehash, html)
+    pagehash[:text] = fixEncode(html.css("body").text)
     return pagehash
   end
 
@@ -35,12 +35,12 @@ module ParsePage
     # OCR PDF and save fields
     u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
     pdfparse = JSON.parse(u.handleDoc)
-    pdfparse.each{|k, v| pagehash[k] = v.encode("UTF-8")}
+    pdfparse.each{|k, v| pagehash[k] = fixEncode(v)}
     return pagehash
   end
 
   # Get the page metadata
-  def getMetadata(url)
+  def getMetadata(url, html)
     pagehash = Hash.new
 
     # Save URL and date retreived
@@ -50,14 +50,21 @@ module ParsePage
     pagehash[:date_retrieved] = Time.now
 
     # Get title and meta tag info
-    html = Nokogiri::HTML(getPage(url).body) # Eventually modify this
-    pagehash[:title] = html.css("title").text.encode("UTF-8")
+    pagehash[:title] = fixEncode(html.css("title").text)
     html.css("meta").each do |m|
       if m
-        pagehash[m['name']] = m['content']
+        pagehash[m['name']] = fixEncode(m['content'])
       end
     end
 
     return pagehash
   end
+
+  def fixEncode(str)
+    if str.is_a?(String)
+      return str.unpack('C*').pack('U*')
+    else
+      return str
+    end
+  end
 end
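Note: fixEncode replaces the previous String#encode("UTF-8") calls, and the single parsed Nokogiri document is now threaded through getMetadata and getContent. A minimal standalone sketch (the sample byte string is an assumption) of what unpack('C*').pack('U*') does, namely re-reading each byte as a Unicode codepoint so Latin-1-style bytes become valid UTF-8:

    # Same method body as the diff above; shown standalone for clarity.
    def fixEncode(str)
      if str.is_a?(String)
        return str.unpack('C*').pack('U*')
      else
        return str
      end
    end

    latin1 = "caf\xE9".force_encoding("ASCII-8BIT")   # 0xE9 is "é" in Latin-1
    puts fixEncode(latin1)         # => "café" (now valid UTF-8)
    puts fixEncode(nil).inspect    # => nil (non-strings pass through)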
data/lib/proxy_manager.rb CHANGED
@@ -1,35 +1,53 @@
 require 'active_support/time'
 require 'mechanize'
+require 'uri'
 
 module ProxyManager
   # Get the page with a proxy
-  def getPage(url, form_input = nil)
+  def getPage(url, form_input = nil, fail_count = 0)
     agent = Mechanize.new do |a|
       a.user_agent_alias = "Linux Firefox"
-      a.set_proxy(getRandomProxy, 80)
+      a.set_proxy(*getRandomProxy(url))
     end
 
-    if form_input
-      gform = agent.get(url).form("f")
-      gform.q = form_input
-      return agent.submit(gform, gform.buttons.first)
-    else
-      return agent.get(url)
+    # Slightly different based on filling in form or not
+    begin
+      if form_input
+        gform = agent.get(url).form("f")
+        gform.q = form_input
+        return agent.submit(gform, gform.buttons.first)
+      else
+        return agent.get(url)
+      end
+    rescue # Only retry request 5 times
+      getPage(url, form_input, fail_count+=1) if fail_count < 5
     end
   end
 
   # Choose a random proxy
-  def getRandomProxy
+  def getRandomProxy(url)
     max = @proxylist.length
     chosen = @proxylist[Random.rand(max)]
 
-    # Only use proxy if it hasn't been used in last 20 seconds
-    if !@usedproxies[chosen] || @usedproxies[chosen] < Time.now-20
-      @usedproxies[chosen] = Time.now
-      return chosen
+    # Only use proxy if it hasn't been used in last 20 seconds on same host
+    if isNotUsed?(chosen, url)
+      @usedproxies[chosen] = [Time.now, URI.parse(url).host]
+      return parseProxy(chosen)
     else
-      sleep(0.5)
-      getRandomProxy
+      sleep(0.005)
+      getRandomProxy(url)
     end
   end
+
+  # Splits up proxy into IP, port, user, password
+  def parseProxy(chosen)
+    proxy_info = chosen.split(":")
+    proxy_info[proxy_info.length-1] = proxy_info.last.strip
+    return proxy_info
+  end
+
+  # Checks if a proxy has been used on domain in the last 20 seconds
+  def isNotUsed?(chosen, url)
+    return !@usedproxies[chosen] || @usedproxies[chosen][0] <= Time.now-20 || @usedproxies[chosen][1] != URI.parse(url).host
+  end
 end
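Note: set_proxy now receives the splatted result of getRandomProxy, so each proxy-list entry is expected to be a colon-separated string. A minimal sketch of parseProxy's output (the sample entries, and the trailing newline as if read from a file, are assumptions):

    # Same splitting logic as parseProxy in the diff above.
    def parseProxy(chosen)
      proxy_info = chosen.split(":")
      proxy_info[proxy_info.length - 1] = proxy_info.last.strip
      return proxy_info
    end

    p parseProxy("203.0.113.7:8080\n")            # => ["203.0.113.7", "8080"]
    p parseProxy("203.0.113.7:8080:user:secret")  # => ["203.0.113.7", "8080", "user", "secret"]
    # Mechanize#set_proxy then receives these as (address, port[, user, password]).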
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-07 00:00:00.000000000 Z
+date: 2014-04-10 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com