generalscraper 0.0.9 → 0.0.10

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6d547a3d1293c92e7f9553422668a0fb0f382b3a
-  data.tar.gz: 4f060e26095d1abc162d348c1a5f3aa0c334d32b
+  metadata.gz: 29d8ecb9bacae3fb5f21d57bd48aa6f7aa7bf9e3
+  data.tar.gz: 7bb42fc8560c5be6e65dd93827b4eafbfa647718
 SHA512:
-  metadata.gz: 96d5d92f5f376dbbd8a7d42a4e6cb0bffc97768a261fe9d59afd3b56302d5bce98cdc452ad7eebdbcbb540216e459562252ce372ba3419d36df508e9dcd45b06
-  data.tar.gz: 3a2c4a317a140f6183d0ac9baf2e6cafd1b4777b648f0cc84805a20ba40f87aa8fc932126dca5f93641482df9486e3182073888fd905263d56d3e4dabfbcfce2
+  metadata.gz: 02e934121567d5fb9f18392581ac145e94f01ee1473261090d15fd0490abeff8a96888fed556f792e46728725d6d03aa542fb34fc309e0dc66ecc2204d6988c7
+  data.tar.gz: c020dcf77f83b20e52325988ad6401a8e315410bbda70d4b5d58842065579cb897bfe6869c9f88eb8e60a89f243835bd4fbbdd5fa4713910b77033684d4b3840
data/lib/generalscraper.rb CHANGED
@@ -20,48 +20,75 @@ class GeneralScraper
     @urllist = Array.new
     @startindex = 10
     @use_proxy = use_proxy
+
+    # Generate driver
+    profile = Selenium::WebDriver::Firefox::Profile.new
+    profile['intl.accept_languages'] = 'en'
+    @driver = Selenium::WebDriver.for :firefox, profile: profile
   end
 
   # Searches for links on Google
   def search
-    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm, @use_proxy))
+    categorizeLinks(getPage("http://google.com", @driver, @operators + " " + @searchterm, @use_proxy))
+  end
+
+  # Gets the links from the page
+  def getLinks(page)
+    # Sleep while things load
+    sleep(10)
+
+    # Extract arr
+    return page.find_elements(css: "a").inject(Array.new) do |link_arr, al|
+      begin
+        link_arr.push(al.attribute("href"))
+      rescue
+
+      end
+
+      link_arr
+    end
   end
 
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
-    page.links.each do |link|
-      if isResultLink?(link)
-        siteURLSave(link)
-      elsif isSearchPageLink?(link)
-        nextSearchPage(link)
+    links = getLinks(page)
+    links.each do |link|
+      if link
+        if isResultLink?(link)
+          siteURLSave(link)
+        elsif isSearchPageLink?(link)
+          nextSearchPage(link)
+        end
       end
     end
   end
 
   # Determines if url is link to search result
   def isResultLink?(link)
-    return (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+    return (link.include? @op_val) &&
+      (!link.include? "webcache") &&
+      (!link.include? @operators.gsub(" ", "+")) &&
+      (!link.include?("translate.google"))
   end
 
   # Determines if URL is link to next search page
   def isSearchPageLink?(link)
-    return (link.href.include? "&sa=N") && (link.href.include? "&start=")
+    return (link.include? "&sa=N") && (link.include? "&start=")
   end
 
 
   # Parse and save the URLs for search results
   def siteURLSave(link)
-    site_url = link.href.split("?q=")[1]
-    @urllist.push(site_url.split("&")[0]) if site_url
+    @urllist.push(link)
   end
 
   # Process search links and go to next page
   def nextSearchPage(link)
-    page_index_num = link.href.split("&start=")[1].split("&sa=N")[0]
-
+    page_index_num = link.split("&start=")[1].split("&sa=N")[0]
+
     if page_index_num.to_i == @startindex
       @startindex += 10
-      categorizeLinks(getPage("http://google.com" + link.href + "&filter=0", @use_proxy))
+      categorizeLinks(getPage(link, @driver, @use_proxy))
     end
   end
 
@@ -70,14 +97,17 @@ class GeneralScraper
   def getData
     search
     @urllist.each do |url|
-      getPageData(url)
+      getPageData(url, @driver)
     end
+    @driver.close
     return JSON.pretty_generate(@output)
   end
 
   # Returns a list of search result URLs
   def getURLs
     search
+    @driver.close
     return JSON.pretty_generate(@urllist)
   end
 end
+
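Taken together, the generalscraper.rb changes replace Mechanize link parsing with a shared Selenium Firefox session that persists across result pages. A minimal usage sketch; the constructor argument order is assumed from the instance variables (@operators, @searchterm, @use_proxy) visible in the diff, not from a documented signature:

  require 'generalscraper'

  # Assumed constructor order: operators, search term, proxy flag.
  scraper = GeneralScraper.new("site:example.com", "open data", false)

  # Runs the Google search, saves result links, follows "&start=" pagination,
  # closes the Firefox driver, and returns the URL list as pretty-printed JSON.
  puts scraper.getURLs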
data/lib/parse_page.rb CHANGED
@@ -2,9 +2,10 @@ require 'uploadconvert'
 
 module ParsePage
   # Get both page metadata and text
-  def getPageData(url)
+  def getPageData(url, driver)
     begin
-      html = Nokogiri::HTML(getPage(url).body)
+      page = getPage(url, driver, nil, 5, false)
+      html = Nokogiri::HTML(page.page_source)
       pagehash = getMetadata(url, html)
       pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
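With this change, getPageData no longer refetches the URL over plain HTTP; it reuses the Selenium driver and hands the rendered DOM to Nokogiri. The same pattern in isolation (the URL is illustrative):

  require 'nokogiri'
  require 'selenium-webdriver'

  driver = Selenium::WebDriver.for :firefox
  driver.navigate.to "http://example.com"

  # page_source returns the browser's current DOM, including content
  # rendered by JavaScript that a plain HTTP response body would miss.
  html = Nokogiri::HTML(driver.page_source)
  puts html.title
  driver.quit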
data/lib/proxy_manager.rb CHANGED
@@ -1,10 +1,11 @@
 require 'active_support/time'
 require 'mechanize'
 require 'uri'
+require 'selenium-webdriver'
 
 module ProxyManager
   # Get the page with a proxy
-  def getPage(url, form_input = nil, fail_count = 0, use_proxy)
+  def getPage(url, driver, form_input = nil, fail_count = 0, use_proxy)
     agent = Mechanize.new do |a|
       a.user_agent_alias = "Linux Firefox"
 
@@ -19,12 +20,17 @@ module ProxyManager
     # Slightly different based on filling in form or not
     begin
       if form_input
-        gform = agent.get(url).form("f")
-        gform.q = form_input
-        return agent.submit(gform, gform.buttons.first)
+        driver.navigate.to url
+        element = driver.find_element(name: "q")
+        element.send_keys form_input
+        element.submit
+        puts "Searched for: " + form_input
+
+        return driver
       else
         puts "Getting page " + url
-        driver.navigate.to url
+        driver.navigate.to url
+        return driver
       end
     rescue # Only retry request 10 times
       puts "FAILED"
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.9
+  version: 0.0.10
 platform: ruby
 authors:
 - M. C. McGrath