generalscraper 0.0.9 → 0.0.10

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6d547a3d1293c92e7f9553422668a0fb0f382b3a
4
- data.tar.gz: 4f060e26095d1abc162d348c1a5f3aa0c334d32b
3
+ metadata.gz: 29d8ecb9bacae3fb5f21d57bd48aa6f7aa7bf9e3
4
+ data.tar.gz: 7bb42fc8560c5be6e65dd93827b4eafbfa647718
5
5
  SHA512:
6
- metadata.gz: 96d5d92f5f376dbbd8a7d42a4e6cb0bffc97768a261fe9d59afd3b56302d5bce98cdc452ad7eebdbcbb540216e459562252ce372ba3419d36df508e9dcd45b06
7
- data.tar.gz: 3a2c4a317a140f6183d0ac9baf2e6cafd1b4777b648f0cc84805a20ba40f87aa8fc932126dca5f93641482df9486e3182073888fd905263d56d3e4dabfbcfce2
6
+ metadata.gz: 02e934121567d5fb9f18392581ac145e94f01ee1473261090d15fd0490abeff8a96888fed556f792e46728725d6d03aa542fb34fc309e0dc66ecc2204d6988c7
7
+ data.tar.gz: c020dcf77f83b20e52325988ad6401a8e315410bbda70d4b5d58842065579cb897bfe6869c9f88eb8e60a89f243835bd4fbbdd5fa4713910b77033684d4b3840
@@ -20,48 +20,75 @@ class GeneralScraper
20
20
  @urllist = Array.new
21
21
  @startindex = 10
22
22
  @use_proxy = use_proxy
23
+
24
+ # Generate driver
25
+ profile = Selenium::WebDriver::Firefox::Profile.new
26
+ profile['intl.accept_languages'] = 'en'
27
+ @driver = Selenium::WebDriver.for :firefox, profile: profile
23
28
  end
24
29
 
25
30
  # Searches for links on Google
26
31
  def search
27
- categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm, @use_proxy))
32
+ categorizeLinks(getPage("http://google.com", @driver, @operators + " " + @searchterm, @use_proxy))
33
+ end
34
+
35
+ # Gets the links from the page
36
+ def getLinks(page)
37
+ # Sleep while things load
38
+ sleep(10)
39
+
40
+ # Extract arr
41
+ return page.find_elements(css: "a").inject(Array.new) do |link_arr, al|
42
+ begin
43
+ link_arr.push(al.attribute("href"))
44
+ rescue
45
+
46
+ end
47
+
48
+ link_arr
49
+ end
28
50
  end
29
51
 
30
52
  # Categorizes the links on results page into results and other search pages
31
53
  def categorizeLinks(page)
32
- page.links.each do |link|
33
- if isResultLink?(link)
34
- siteURLSave(link)
35
- elsif isSearchPageLink?(link)
36
- nextSearchPage(link)
54
+ links = getLinks(page)
55
+ links.each do |link|
56
+ if link
57
+ if isResultLink?(link)
58
+ siteURLSave(link)
59
+ elsif isSearchPageLink?(link)
60
+ nextSearchPage(link)
61
+ end
37
62
  end
38
63
  end
39
64
  end
40
65
 
41
66
  # Determines if url is link to search result
42
67
  def isResultLink?(link)
43
- return (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
68
+ return (link.include? @op_val) &&
69
+ (!link.include? "webcache") &&
70
+ (!link.include? @operators.gsub(" ", "+")) &&
71
+ (!link.include?("translate.google"))
44
72
  end
45
73
 
46
74
  # Determines if URL is link to next search page
47
75
  def isSearchPageLink?(link)
48
- return (link.href.include? "&sa=N") && (link.href.include? "&start=")
76
+ return (link.include? "&sa=N") && (link.include? "&start=")
49
77
  end
50
78
 
51
79
 
52
80
  # Parse and save the URLs for search results
53
81
  def siteURLSave(link)
54
- site_url = link.href.split("?q=")[1]
55
- @urllist.push(site_url.split("&")[0]) if site_url
82
+ @urllist.push(link)
56
83
  end
57
84
 
58
85
  # Process search links and go to next page
59
86
  def nextSearchPage(link)
60
- page_index_num = link.href.split("&start=")[1].split("&sa=N")[0]
61
-
87
+ page_index_num = link.split("&start=")[1].split("&sa=N")[0]
88
+
62
89
  if page_index_num.to_i == @startindex
63
90
  @startindex += 10
64
- categorizeLinks(getPage("http://google.com" + link.href + "&filter=0", @use_proxy))
91
+ categorizeLinks(getPage(link, @driver, @use_proxy))
65
92
  end
66
93
  end
67
94
 
@@ -70,14 +97,17 @@ class GeneralScraper
70
97
  def getData
71
98
  search
72
99
  @urllist.each do |url|
73
- getPageData(url)
100
+ getPageData(url, @driver)
74
101
  end
102
+ @driver.close
75
103
  return JSON.pretty_generate(@output)
76
104
  end
77
105
 
78
106
  # Returns a list of search result URLs
79
107
  def getURLs
80
108
  search
109
+ @driver.close
81
110
  return JSON.pretty_generate(@urllist)
82
111
  end
83
112
  end
113
+
data/lib/parse_page.rb CHANGED
@@ -2,9 +2,10 @@ require 'uploadconvert'
2
2
 
3
3
  module ParsePage
4
4
  # Get both page metadata and text
5
- def getPageData(url)
5
+ def getPageData(url, driver)
6
6
  begin
7
- html = Nokogiri::HTML(getPage(url).body)
7
+ page = getPage(url, driver, nil, 5, false)
8
+ html = Nokogiri::HTML(page.page_source)
8
9
  pagehash = getMetadata(url, html)
9
10
  pagehash = getContent(url, pagehash, html)
10
11
  @output.push(pagehash)
data/lib/proxy_manager.rb CHANGED
@@ -1,10 +1,11 @@
1
1
  require 'active_support/time'
2
2
  require 'mechanize'
3
3
  require 'uri'
4
+ require 'selenium-webdriver'
4
5
 
5
6
  module ProxyManager
6
7
  # Get the page with a proxy
7
- def getPage(url, form_input = nil, fail_count = 0, use_proxy)
8
+ def getPage(url, driver, form_input = nil, fail_count = 0, use_proxy)
8
9
  agent = Mechanize.new do |a|
9
10
  a.user_agent_alias = "Linux Firefox"
10
11
 
@@ -19,12 +20,17 @@ module ProxyManager
19
20
  # Slightly different based on filling in form or not
20
21
  begin
21
22
  if form_input
22
- gform = agent.get(url).form("f")
23
- gform.q = form_input
24
- return agent.submit(gform, gform.buttons.first)
23
+ driver.navigate.to url
24
+ element = driver.find_element(name: "q")
25
+ element.send_keys form_input
26
+ element.submit
27
+ puts "Searched for: " + form_input
28
+
29
+ return driver
25
30
  else
26
31
  puts "Getting page " + url
27
- return agent.get(url)
32
+ driver.navigate.to url
33
+ return driver
28
34
  end
29
35
  rescue # Only retry request 10 times
30
36
  puts "FAILED"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath