generalscraper 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 338629c39c095b6f64780ab9d13f5372234f8ed8
-  data.tar.gz: ae8586f85b7c14cb1b6e0e44dd7eb90b590d6e8c
+  metadata.gz: a23d6483229cb3d18a14d8ba906658779f1a80d2
+  data.tar.gz: 0a719a46f8b3091880419495ad8e746d3e53e722
 SHA512:
-  metadata.gz: 7d3c7370d236d984f0a6e49790f1bda765ad817b1d6a3680d90d8c682c992aeeb7723e64d7a4c0d2f1e761b5403851313f814d1537d4741ba7d75951305fee28
-  data.tar.gz: a2dd8f531f2a9fbcc61bc0f206765da233fead65bfa0ac0e045472c0af465e7758d822e1b0bacea473498ee7e45caa6492223eadc1cd5d9d9c19ffa3329a572c
+  metadata.gz: 9554472410fb5879f28de9fdd3e55625c2d5016435bf3df4ad314d6f2cb9b77101fde746f97d651575e006f49d3825256095ea6b73d1154d5072d5cee8ee12c1
+  data.tar.gz: 76f39348dde21f700ce560346243b5b18e857340ce59a973cb23c6735d89ea626d93a3e1cd0d3c6027e201e74d2fef1762a640f0c3cbb35792f89c5300def338
data/lib/generalscraper.rb CHANGED
@@ -1,46 +1,50 @@
 require 'json'
 require 'nokogiri'
 require 'mechanize'
+require 'requestmanager'
+require 'pry'
 
 load 'parse_page.rb'
-load 'proxy_manager.rb'
 
 class GeneralScraper
   include ParsePage
-  include ProxyManager
 
-  def initialize(operators, searchterm, proxylist, use_proxy)
+  def initialize(operators, searchterm, proxylist)
     @operators = operators
     @searchterm = searchterm
     @op_val = @operators.split(" ")[0].split(":")[1]
-    @proxylist = IO.readlines(proxylist)
-    @usedproxies = Hash.new
+    @proxylist = proxylist
+    @requests = RequestManager.new(@proxylist, [4, 15], 1)
 
     @output = Array.new
     @urllist = Array.new
     @startindex = 10
-    @use_proxy = use_proxy
-
-    # Generate driver
-    profile = Selenium::WebDriver::Firefox::Profile.new
-    profile['intl.accept_languages'] = 'en'
-    @driver = Selenium::WebDriver.for :firefox, profile: profile
   end
 
   # Searches for links on Google
   def search
-    categorizeLinks(getPage("http://google.com", @driver, @operators + " " + @searchterm, @use_proxy))
+    check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
+                  "http://google.com", (@operators + " " + @searchterm))
+  end
+
+  # Check that page with links loaded
+  def check_results(page, *requested_page)
+    if page.include?("To continue, please type the characters below:")
+      @requests.restart_browser
+      check_results(@requests.get_page(requested_page), requested_page)
+    else
+      categorizeLinks(page)
+    end
   end
 
   # Gets the links from the page
-  def getLinks(page)
-    # Sleep while things load
-    sleep(10)
-
-    # Extract arr
-    return page.find_elements(css: "a").inject(Array.new) do |link_arr, al|
+  def getLinks(page)
+    html = Nokogiri::HTML(page)
+
+    # Get array of links
+    return html.css("a").inject(Array.new) do |link_arr, al|
       begin
-        link_arr.push(al.attribute("href"))
+        link_arr.push(al["href"])
      rescue
 
      end
@@ -52,12 +56,14 @@ class GeneralScraper
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
     links = getLinks(page)
+
+    # Categorize as results or search pages
     links.each do |link|
       if link
         if isResultLink?(link)
           siteURLSave(link)
         elsif isSearchPageLink?(link)
-          nextSearchPage(link)
+          nextSearchPage("google.com"+link)
         end
       end
     end
@@ -88,26 +94,25 @@ class GeneralScraper
 
     if page_index_num.to_i == @startindex
       @startindex += 10
-      categorizeLinks(getPage(link, @driver, @use_proxy))
+      check_results(@requests.get_page(link), link)
     end
   end
 
-
   # Gets all data and returns in JSON
   def getData
     search
     @urllist.each do |url|
-      getPageData(url, @driver)
+      getPageData(url)
     end
-    @driver.close
+
+    @requests.close_all_browsers
     return JSON.pretty_generate(@output)
   end
 
   # Returns a list of search result URLs
   def getURLs
     search
-    @driver.close
+    @requests.close_all_browsers
     return JSON.pretty_generate(@urllist)
   end
 end
-
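The net effect of the changes above is that GeneralScraper no longer drives Selenium or rotates proxies itself: it takes a proxy list directly, hands all fetching to a RequestManager instance, and retries through check_results when Google serves a CAPTCHA page. A minimal usage sketch of the new constructor and the two public entry points follows; the proxy addresses and search operator are illustrative placeholders, and the exact proxy string format expected by requestmanager is an assumption not documented in this diff.

require 'generalscraper'

# Hypothetical proxy list; format assumed to match what requestmanager expects
proxies = ["192.0.2.10:8080", "192.0.2.11:8080"]

scraper = GeneralScraper.new("site:example.com", "transparency", proxies)

puts scraper.getData    # JSON array of parsed page data for each search result
# or, on a fresh instance:
# puts scraper.getURLs  # JSON array of result URLs only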
data/lib/parse_page.rb CHANGED
@@ -2,10 +2,10 @@ require 'uploadconvert'
 
 module ParsePage
   # Get both page metadata and text
-  def getPageData(url, driver)
+  def getPageData(url)
     begin
-      page = getPage(url, driver, nil, 5, false)
-      html = Nokogiri::HTML(page.page_source)
+      page = @requests.get_page(url)
+      html = Nokogiri::HTML(page)
       pagehash = getMetadata(url, html)
       pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
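ParsePage now parses the raw HTML string returned by @requests.get_page instead of a Selenium driver's page_source, so any class mixing it in must supply that state itself. A minimal sketch of the implicit contract, using a hypothetical host class for illustration (getMetadata and getContent are existing module methods not shown in this diff):

require 'requestmanager'
load 'parse_page.rb'

# Hypothetical includer; GeneralScraper above sets up the same instance state
class PageCollector
  include ParsePage

  def initialize(proxylist)
    @requests = RequestManager.new(proxylist, [4, 15], 1)  # used by getPageData to fetch HTML
    @output = []                                           # getPageData pushes one hash per page here
  end
end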
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.11
+  version: 0.0.12
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-21 00:00:00.000000000 Z
+date: 2015-11-03 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com
@@ -18,7 +18,6 @@ extra_rdoc_files: []
 files:
 - lib/generalscraper.rb
 - lib/parse_page.rb
-- lib/proxy_manager.rb
 homepage: https://github.com/TransparencyToolkit/generalscraper
 licenses:
 - GPL
@@ -39,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.6
+rubygems_version: 2.4.8
 signing_key:
 specification_version: 4
 summary: Scrapes Google
 test_files: []
+has_rdoc:
data/lib/proxy_manager.rb DELETED
@@ -1,70 +0,0 @@
-require 'active_support/time'
-require 'mechanize'
-require 'uri'
-require 'selenium-webdriver'
-
-module ProxyManager
-  # Get the page with a proxy
-  def getPage(url, driver, form_input = nil, fail_count = 0, use_proxy)
-    agent = Mechanize.new do |a|
-      a.user_agent_alias = "Linux Firefox"
-
-      # Set proxy if specified, otherwise delay to avoid blocks
-      if use_proxy
-        a.set_proxy(*getRandomProxy(url))
-      else
-        sleep(rand(30..90))
-      end
-    end
-
-    # Slightly different based on filling in form or not
-    begin
-      if form_input
-        driver.navigate.to url
-        element = driver.find_element(name: "q")
-        element.send_keys form_input
-        element.submit
-        puts "Searched for: " + form_input
-
-        return driver
-      else
-        puts "Getting page " + url
-        driver.navigate.to url
-        return driver
-      end
-    rescue # Only retry request 10 times
-      begin
-        puts "FAILED"
-        getPage(url, form_input, fail_count+=1) if fail_count < 10
-      rescue
-      end
-    end
-  end
-
-  # Choose a random proxy
-  def getRandomProxy(url)
-    max = @proxylist.length
-    chosen = @proxylist[Random.rand(max)]
-
-    # Only use proxy if it hasn't been used in last 20 seconds on same host
-    if isNotUsed?(chosen, url)
-      @usedproxies[chosen] = [Time.now, URI.parse(url).host]
-      return parseProxy(chosen)
-    else
-      sleep(0.005)
-      getRandomProxy(url)
-    end
-  end
-
-  # Splits up proxy into IP, port, user, password
-  def parseProxy(chosen)
-    proxy_info = chosen.split(":")
-    proxy_info[proxy_info.length-1] = proxy_info.last.strip
-    return proxy_info
-  end
-
-  # Checks if a proxy has been used on domain in the last 20 seconds
-  def isNotUsed?(chosen, url)
-    return !@usedproxies[chosen] || @usedproxies[chosen][0] <= Time.now-20 || @usedproxies[chosen][1] != URI.parse(url).host
-  end
-end
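The deleted module's responsibilities (proxy selection, inter-request delays, and retries) appear to be delegated to the requestmanager gem in 0.0.12. Based only on the calls visible elsewhere in this diff, the replacement pattern looks roughly like the sketch below; the meaning of the [4, 15] and 1 constructor arguments is not documented here and is assumed to be a wait range and a browser count.

require 'requestmanager'

# Constructor arguments mirror the call in generalscraper.rb; their exact
# semantics are an assumption, not documented in this diff.
requests = RequestManager.new(["192.0.2.10:8080"], [4, 15], 1)

html = requests.get_page("http://example.com")  # returns page HTML as a string
requests.restart_browser                        # used above after a CAPTCHA is detected
requests.close_all_browsers                     # cleanup when scraping is done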