generalscraper 0.0.11 → 0.0.12

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 338629c39c095b6f64780ab9d13f5372234f8ed8
- data.tar.gz: ae8586f85b7c14cb1b6e0e44dd7eb90b590d6e8c
+ metadata.gz: a23d6483229cb3d18a14d8ba906658779f1a80d2
+ data.tar.gz: 0a719a46f8b3091880419495ad8e746d3e53e722
  SHA512:
- metadata.gz: 7d3c7370d236d984f0a6e49790f1bda765ad817b1d6a3680d90d8c682c992aeeb7723e64d7a4c0d2f1e761b5403851313f814d1537d4741ba7d75951305fee28
- data.tar.gz: a2dd8f531f2a9fbcc61bc0f206765da233fead65bfa0ac0e045472c0af465e7758d822e1b0bacea473498ee7e45caa6492223eadc1cd5d9d9c19ffa3329a572c
+ metadata.gz: 9554472410fb5879f28de9fdd3e55625c2d5016435bf3df4ad314d6f2cb9b77101fde746f97d651575e006f49d3825256095ea6b73d1154d5072d5cee8ee12c1
+ data.tar.gz: 76f39348dde21f700ce560346243b5b18e857340ce59a973cb23c6735d89ea626d93a3e1cd0d3c6027e201e74d2fef1762a640f0c3cbb35792f89c5300def338
data/lib/generalscraper.rb CHANGED
@@ -1,46 +1,50 @@
  require 'json'
  require 'nokogiri'
  require 'mechanize'
+ require 'requestmanager'
+ require 'pry'
 
  load 'parse_page.rb'
- load 'proxy_manager.rb'
 
  class GeneralScraper
    include ParsePage
-   include ProxyManager
 
-   def initialize(operators, searchterm, proxylist, use_proxy)
+   def initialize(operators, searchterm, proxylist)
      @operators = operators
      @searchterm = searchterm
      @op_val = @operators.split(" ")[0].split(":")[1]
-     @proxylist = IO.readlines(proxylist)
-     @usedproxies = Hash.new
+     @proxylist = proxylist
+     @requests = RequestManager.new(@proxylist, [4, 15], 1)
 
      @output = Array.new
      @urllist = Array.new
      @startindex = 10
-     @use_proxy = use_proxy
-
-     # Generate driver
-     profile = Selenium::WebDriver::Firefox::Profile.new
-     profile['intl.accept_languages'] = 'en'
-     @driver = Selenium::WebDriver.for :firefox, profile: profile
    end
 
    # Searches for links on Google
    def search
-     categorizeLinks(getPage("http://google.com", @driver, @operators + " " + @searchterm, @use_proxy))
+     check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
+                   "http://google.com", (@operators + " " + @searchterm))
+   end
+
+   # Check that page with links loaded
+   def check_results(page, *requested_page)
+     if page.include?("To continue, please type the characters below:")
+       @requests.restart_browser
+       check_results(@requests.get_page(requested_page), requested_page)
+     else
+       categorizeLinks(page)
+     end
    end
 
    # Gets the links from the page
-   def getLinks(page)
-     # Sleep while things load
-     sleep(10)
-
-     # Extract arr
-     return page.find_elements(css: "a").inject(Array.new) do |link_arr, al|
+   def getLinks(page)
+     html = Nokogiri::HTML(page)
+
+     # Get array of links
+     return html.css("a").inject(Array.new) do |link_arr, al|
        begin
-         link_arr.push(al.attribute("href"))
+         link_arr.push(al["href"])
        rescue
 
        end
@@ -52,12 +56,14 @@ class GeneralScraper
    # Categorizes the links on results page into results and other search pages
    def categorizeLinks(page)
      links = getLinks(page)
+
+     # Categorize as results or search pages
      links.each do |link|
        if link
          if isResultLink?(link)
            siteURLSave(link)
          elsif isSearchPageLink?(link)
-           nextSearchPage(link)
+           nextSearchPage("google.com"+link)
          end
        end
      end
@@ -88,26 +94,25 @@ class GeneralScraper
 
      if page_index_num.to_i == @startindex
        @startindex += 10
-       categorizeLinks(getPage(link, @driver, @use_proxy))
+       check_results(@requests.get_page(link), link)
      end
    end
 
-
    # Gets all data and returns in JSON
    def getData
      search
      @urllist.each do |url|
-       getPageData(url, @driver)
+       getPageData(url)
      end
-     @driver.close
+
+     @requests.close_all_browsers
      return JSON.pretty_generate(@output)
    end
 
    # Returns a list of search result URLs
    def getURLs
      search
-     @driver.close
+     @requests.close_all_browsers
      return JSON.pretty_generate(@urllist)
    end
  end
-
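
For orientation, a minimal usage sketch of the API after this change, based only on the constructor and public methods visible in the diff above. The operator string, search term, and proxy list value are placeholder assumptions; the proxy list format is defined by the requestmanager gem rather than documented here.

  require 'generalscraper'

  # Hypothetical values: a site: operator plus a search term, and a proxy list
  # passed straight through to RequestManager.
  scraper = GeneralScraper.new("site:example.com", "open data", "proxies.txt")

  puts scraper.getData   # runs the search, scrapes each result, returns pretty-printed JSON
  # scraper.getURLs      # alternatively, just the result URLs as JSON (also runs a search)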
data/lib/parse_page.rb CHANGED
@@ -2,10 +2,10 @@ require 'uploadconvert'
 
  module ParsePage
    # Get both page metadata and text
-   def getPageData(url, driver)
+   def getPageData(url)
      begin
-       page = getPage(url, driver, nil, 5, false)
-       html = Nokogiri::HTML(page.page_source)
+       page = @requests.get_page(url)
+       html = Nokogiri::HTML(page)
        pagehash = getMetadata(url, html)
        pagehash = getContent(url, pagehash, html)
        @output.push(pagehash)
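
The same RequestManager instance now handles all page fetching. A standalone sketch of the fetch-and-parse pattern that getPageData follows above, assuming get_page returns the raw HTML as a string; the proxy file name is a placeholder and the constructor arguments [4, 15] and 1 simply mirror the values used in generalscraper.rb (their exact meaning comes from the requestmanager gem).

  require 'nokogiri'
  require 'requestmanager'

  # "proxies.txt" is a placeholder proxy list.
  requests = RequestManager.new("proxies.txt", [4, 15], 1)
  html = Nokogiri::HTML(requests.get_page("http://example.com"))  # raw HTML in, parsed document out
  puts html.css("title").text
  requests.close_all_browsers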
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: generalscraper
  version: !ruby/object:Gem::Version
- version: 0.0.11
+ version: 0.0.12
  platform: ruby
  authors:
  - M. C. McGrath
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-10-21 00:00:00.000000000 Z
+ date: 2015-11-03 00:00:00.000000000 Z
  dependencies: []
  description: Scrapes Google
  email: shidash@shidash.com
@@ -18,7 +18,6 @@ extra_rdoc_files: []
  files:
  - lib/generalscraper.rb
  - lib/parse_page.rb
- - lib/proxy_manager.rb
  homepage: https://github.com/TransparencyToolkit/generalscraper
  licenses:
  - GPL
@@ -39,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.4.6
+ rubygems_version: 2.4.8
  signing_key:
  specification_version: 4
  summary: Scrapes Google
  test_files: []
+ has_rdoc:
data/lib/proxy_manager.rb DELETED
@@ -1,70 +0,0 @@
- require 'active_support/time'
- require 'mechanize'
- require 'uri'
- require 'selenium-webdriver'
-
- module ProxyManager
-   # Get the page with a proxy
-   def getPage(url, driver, form_input = nil, fail_count = 0, use_proxy)
-     agent = Mechanize.new do |a|
-       a.user_agent_alias = "Linux Firefox"
-
-       # Set proxy if specified, otherwise delay to avoid blocks
-       if use_proxy
-         a.set_proxy(*getRandomProxy(url))
-       else
-         sleep(rand(30..90))
-       end
-     end
-
-     # Slightly different based on filling in form or not
-     begin
-       if form_input
-         driver.navigate.to url
-         element = driver.find_element(name: "q")
-         element.send_keys form_input
-         element.submit
-         puts "Searched for: " + form_input
-
-         return driver
-       else
-         puts "Getting page " + url
-         driver.navigate.to url
-         return driver
-       end
-     rescue # Only retry request 10 times
-       begin
-         puts "FAILED"
-         getPage(url, form_input, fail_count+=1) if fail_count < 10
-       rescue
-       end
-     end
-   end
-
-   # Choose a random proxy
-   def getRandomProxy(url)
-     max = @proxylist.length
-     chosen = @proxylist[Random.rand(max)]
-
-     # Only use proxy if it hasn't been used in last 20 seconds on same host
-     if isNotUsed?(chosen, url)
-       @usedproxies[chosen] = [Time.now, URI.parse(url).host]
-       return parseProxy(chosen)
-     else
-       sleep(0.005)
-       getRandomProxy(url)
-     end
-   end
-
-   # Splits up proxy into IP, port, user, password
-   def parseProxy(chosen)
-     proxy_info = chosen.split(":")
-     proxy_info[proxy_info.length-1] = proxy_info.last.strip
-     return proxy_info
-   end
-
-   # Checks if a proxy has been used on domain in the last 20 seconds
-   def isNotUsed?(chosen, url)
-     return !@usedproxies[chosen] || @usedproxies[chosen][0] <= Time.now-20 || @usedproxies[chosen][1] != URI.parse(url).host
-   end
- end