generalscraper 0.0.15 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/captcha.rb +56 -0
  3. data/lib/generalscraper.rb +33 -35
  4. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5790426632941446e267e863ba2141ad14399267
4
- data.tar.gz: 498067c4274cb8ba6d7cdfa7991ba0b3e8efa708
3
+ metadata.gz: 32261924b8307b97e84ae2ba6c884ac6d3caa93c
4
+ data.tar.gz: f888607279ef76b7f72fabe9726e602c3bcafad7
5
5
  SHA512:
6
- metadata.gz: 2b6fd4f0dba9a1fa8d48a7f50483066b90703314c725d6852ffda24703c8097e515caac71e2116ebd9f1937b131fcc9851559abd5a375335cfcdb3a87b0ef9ec
7
- data.tar.gz: 9478594515f1002611c3ec2b06467d0f5a284bd8a282a449e98fc1433252dfa805b5cff716fb98c85757e417fe211868351b1bea7871847aafb049ce0d591fdd
6
+ metadata.gz: 46ef701e1b891cc9d75517d90710c458ef377902454827830633bedfc7c117769255d694a175a20085c43a9b969d2d0375b8442132bca0447cc45613cfebf313
7
+ data.tar.gz: 4c5d2c254af2d02616c5441b8213b9ebbdf82dd1f82e0b4c91edbca0cf9f16e4b7c732e419fbbe7d0655b08c783b9070c87ce97eb0c83f03ee92f7043a0efceb
data/lib/captcha.rb ADDED
@@ -0,0 +1,56 @@
1
+ require 'rmagick'
2
+ require 'imgurr'
3
+ require 'curb'
4
+ require 'two_captcha'
5
+ include Magick
6
+
7
+ class Captcha
8
+ def initialize(requests, solver_details)
9
+ @requests = requests
10
+ @captcha_key = solver_details[:captcha_key]
11
+ end
12
+
13
+ # Solves the captcha
14
+ def solve
15
+ take_screenshot
16
+ crop_screenshot
17
+ @captcha_solution = get_captcha_solved
18
+ submit_captcha_solution
19
+ delete_screenshots
20
+ end
21
+
22
+ # Have the captcha solved
23
+ def get_captcha_solved
24
+ client = TwoCaptcha.new(@captcha_key)
25
+ captcha = client.decode!(file: File.open(@time_name+"_cropped.png"))
26
+ return captcha.text
27
+ end
28
+
29
+ # Submit the captcha solution
30
+ def submit_captcha_solution
31
+ browser = @requests.get_most_recent_browser[1][0]
32
+ element = browser.find_element(id: "captcha")
33
+ element.send_keys(@captcha_solution)
34
+ element.submit
35
+ end
36
+
37
+ # Takes a screenshot of captcha in browser
38
+ def take_screenshot
39
+ @time_name = Time.now.to_s.gsub(" ", "").gsub("-", "").gsub(":", "").gsub("-", "")
40
+ @requests.get_most_recent_browser[1][0].save_screenshot(@time_name+".png")
41
+ end
42
+
43
+ # Crops the screenshot to be mostly just the CAPTCHA
44
+ def crop_screenshot
45
+ captcha_image = Image.read(@time_name+".png").first
46
+ width = captcha_image.columns
47
+ height = captcha_image.rows
48
+ cropped_captcha = captcha_image.crop(0, 0, width, height/2)
49
+ cropped_captcha.write(@time_name+"_cropped.png")
50
+ end
51
+
52
+ # Deletes the screenshot images
53
+ def delete_screenshots
54
+ File.delete(@time_name+".png", @time_name+"_cropped.png")
55
+ end
56
+ end
data/lib/generalscraper.rb CHANGED
@@ -5,15 +5,17 @@ require 'requestmanager'
5
5
  require 'pry'
6
6
 
7
7
  load 'parse_page.rb'
8
+ load 'captcha.rb'
8
9
 
9
10
  class GeneralScraper
10
11
  include ParsePage
11
12
 
12
- def initialize(operators, searchterm, requests)
13
+ def initialize(operators, searchterm, requests, solver_details)
13
14
  @operators = operators
14
15
  @searchterm = searchterm
15
16
  @op_val = @operators.split(" ")[0].split(":")[1]
16
17
  @requests = requests
18
+ @solver_details = solver_details
17
19
 
18
20
  @output = Array.new
19
21
  @urllist = Array.new
@@ -29,19 +31,30 @@ class GeneralScraper
29
31
  # Check that page with links loaded
30
32
  def check_results(page, *requested_page)
31
33
  if page.include?("To continue, please type the characters below:")
32
- @requests.restart_browser
33
- check_results(@requests.get_page(requested_page), requested_page)
34
- else
35
- categorizeLinks(page)
34
+ # Solve CAPTCHA if enabled
35
+ if @solver_details
36
+ c = Captcha.new(@requests, @solver_details)
37
+ c.solve
38
+
39
+ # Proceed as normal
40
+ sleep(1)
41
+ check_results(@requests.get_updated_current_page)
42
+
43
+ else # Restart and try again if CAPTCHA-solving not enabled
44
+ @requests.restart_browser
45
+ check_results(@requests.get_page(requested_page), requested_page)
46
+ end
47
+ else # No CAPTCHA found :)
48
+ navigate_save_results(page)
36
49
  end
37
50
  end
38
51
 
39
- # Gets the links from the page
40
- def getLinks(page)
52
+ # Gets the links from the page that match css selector in block
53
+ def get_links(page, &block)
41
54
  html = Nokogiri::HTML(page)
42
55
 
43
56
  # Get array of links
44
- return html.css("a").inject(Array.new) do |link_arr, al|
57
+ return yield(html).inject(Array.new) do |link_arr, al|
45
58
  begin
46
59
  link_arr.push(al["href"])
47
60
  rescue
@@ -53,42 +66,27 @@ class GeneralScraper
53
66
  end
54
67
 
55
68
  # Categorizes the links on results page into results and other search pages
56
- def categorizeLinks(page)
57
- links = getLinks(page)
58
-
59
- # Categorize as results or search pages
60
- links.each do |link|
61
- if link
62
- if isResultLink?(link)
63
- siteURLSave(link)
64
- elsif isSearchPageLink?(link)
65
- nextSearchPage("google.com"+link)
66
- end
67
- end
69
+ def navigate_save_results(page)
70
+ # Save result links for page
71
+ result_links = get_links(page) {|html| html.css("h3.r").css("a")}
72
+ result_links.each do |link|
73
+ site_url_save(link)
68
74
  end
69
- end
70
-
71
- # Determines if url is link to search result
72
- def isResultLink?(link)
73
- return (link.include? @op_val) &&
74
- (!link.include? "webcache") &&
75
- (!link.include? @operators.gsub(" ", "+")) &&
76
- (!link.include?("translate.google"))
77
- end
78
75
 
79
- # Determines if URL is link to next search page
80
- def isSearchPageLink?(link)
81
- return (link.include? "&sa=N") && (link.include? "&start=")
76
+ # Go to next page
77
+ next_pages = get_links(page) {|html| html.css("#pnnext")}
78
+ next_pages.each do |link|
79
+ next_search_page("google.com"+link)
80
+ end
82
81
  end
83
-
84
82
 
85
83
  # Parse and save the URLs for search results
86
- def siteURLSave(link)
84
+ def site_url_save(link)
87
85
  @urllist.push(link)
88
86
  end
89
87
 
90
88
  # Process search links and go to next page
91
- def nextSearchPage(link)
89
+ def next_search_page(link)
92
90
  page_index_num = link.split("&start=")[1].split("&sa=N")[0]
93
91
 
94
92
  if page_index_num.to_i == @startindex
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.15
4
+ version: 0.0.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-23 00:00:00.000000000 Z
11
+ date: 2015-12-05 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes Google
14
14
  email: shidash@shidash.com
@@ -16,6 +16,7 @@ executables: []
16
16
  extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
+ - lib/captcha.rb
19
20
  - lib/generalscraper.rb
20
21
  - lib/parse_page.rb
21
22
  homepage: https://github.com/TransparencyToolkit/generalscraper