generalscraper 0.0.15 → 0.0.16

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/captcha.rb +56 -0
  3. data/lib/generalscraper.rb +33 -35
  4. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5790426632941446e267e863ba2141ad14399267
4
- data.tar.gz: 498067c4274cb8ba6d7cdfa7991ba0b3e8efa708
3
+ metadata.gz: 32261924b8307b97e84ae2ba6c884ac6d3caa93c
4
+ data.tar.gz: f888607279ef76b7f72fabe9726e602c3bcafad7
5
5
  SHA512:
6
- metadata.gz: 2b6fd4f0dba9a1fa8d48a7f50483066b90703314c725d6852ffda24703c8097e515caac71e2116ebd9f1937b131fcc9851559abd5a375335cfcdb3a87b0ef9ec
7
- data.tar.gz: 9478594515f1002611c3ec2b06467d0f5a284bd8a282a449e98fc1433252dfa805b5cff716fb98c85757e417fe211868351b1bea7871847aafb049ce0d591fdd
6
+ metadata.gz: 46ef701e1b891cc9d75517d90710c458ef377902454827830633bedfc7c117769255d694a175a20085c43a9b969d2d0375b8442132bca0447cc45613cfebf313
7
+ data.tar.gz: 4c5d2c254af2d02616c5441b8213b9ebbdf82dd1f82e0b4c91edbca0cf9f16e4b7c732e419fbbe7d0655b08c783b9070c87ce97eb0c83f03ee92f7043a0efceb
data/lib/captcha.rb ADDED
@@ -0,0 +1,56 @@
1
+ require 'rmagick'
2
+ require 'imgurr'
3
+ require 'curb'
4
+ require 'two_captcha'
5
+ include Magick
6
+
7
+ class Captcha
8
+ def initialize(requests, solver_details)
9
+ @requests = requests
10
+ @captcha_key = solver_details[:captcha_key]
11
+ end
12
+
13
+ # Solves the captcha
14
+ def solve
15
+ take_screenshot
16
+ crop_screenshot
17
+ @captcha_solution = get_captcha_solved
18
+ submit_captcha_solution
19
+ delete_screenshots
20
+ end
21
+
22
+ # Have the captcha solved
23
+ def get_captcha_solved
24
+ client = TwoCaptcha.new(@captcha_key)
25
+ captcha = client.decode!(file: File.open(@time_name+"_cropped.png"))
26
+ return captcha.text
27
+ end
28
+
29
+ # Submit the captcha solution
30
+ def submit_captcha_solution
31
+ browser = @requests.get_most_recent_browser[1][0]
32
+ element = browser.find_element(id: "captcha")
33
+ element.send_keys(@captcha_solution)
34
+ element.submit
35
+ end
36
+
37
+ # Takes a screenshot of captcha in browser
38
+ def take_screenshot
39
+ @time_name = Time.now.to_s.gsub(" ", "").gsub("-", "").gsub(":", "").gsub("-", "")
40
+ @requests.get_most_recent_browser[1][0].save_screenshot(@time_name+".png")
41
+ end
42
+
43
+ # Crops the screenshot to be mostly just the CAPTCHA
44
+ def crop_screenshot
45
+ captcha_image = Image.read(@time_name+".png").first
46
+ width = captcha_image.columns
47
+ height = captcha_image.rows
48
+ cropped_captcha = captcha_image.crop(0, 0, width, height/2)
49
+ cropped_captcha.write(@time_name+"_cropped.png")
50
+ end
51
+
52
+ # Deletes the screenshot images
53
+ def delete_screenshots
54
+ File.delete(@time_name+".png", @time_name+"_cropped.png")
55
+ end
56
+ end
data/lib/generalscraper.rb CHANGED
@@ -5,15 +5,17 @@ require 'requestmanager'
5
5
  require 'pry'
6
6
 
7
7
  load 'parse_page.rb'
8
+ load 'captcha.rb'
8
9
 
9
10
  class GeneralScraper
10
11
  include ParsePage
11
12
 
12
- def initialize(operators, searchterm, requests)
13
+ def initialize(operators, searchterm, requests, solver_details)
13
14
  @operators = operators
14
15
  @searchterm = searchterm
15
16
  @op_val = @operators.split(" ")[0].split(":")[1]
16
17
  @requests = requests
18
+ @solver_details = solver_details
17
19
 
18
20
  @output = Array.new
19
21
  @urllist = Array.new
@@ -29,19 +31,30 @@ class GeneralScraper
29
31
  # Check that page with links loaded
30
32
  def check_results(page, *requested_page)
31
33
  if page.include?("To continue, please type the characters below:")
32
- @requests.restart_browser
33
- check_results(@requests.get_page(requested_page), requested_page)
34
- else
35
- categorizeLinks(page)
34
+ # Solve CAPTCHA if enabled
35
+ if @solver_details
36
+ c = Captcha.new(@requests, @solver_details)
37
+ c.solve
38
+
39
+ # Proceed as normal
40
+ sleep(1)
41
+ check_results(@requests.get_updated_current_page)
42
+
43
+ else # Restart and try again if CAPTCHA-solving not enabled
44
+ @requests.restart_browser
45
+ check_results(@requests.get_page(requested_page), requested_page)
46
+ end
47
+ else # No CAPTCHA found :)
48
+ navigate_save_results(page)
36
49
  end
37
50
  end
38
51
 
39
- # Gets the links from the page
40
- def getLinks(page)
52
+ # Gets the links from the page that match css selector in block
53
+ def get_links(page, &block)
41
54
  html = Nokogiri::HTML(page)
42
55
 
43
56
  # Get array of links
44
- return html.css("a").inject(Array.new) do |link_arr, al|
57
+ return yield(html).inject(Array.new) do |link_arr, al|
45
58
  begin
46
59
  link_arr.push(al["href"])
47
60
  rescue
@@ -53,42 +66,27 @@ class GeneralScraper
53
66
  end
54
67
 
55
68
  # Categorizes the links on results page into results and other search pages
56
- def categorizeLinks(page)
57
- links = getLinks(page)
58
-
59
- # Categorize as results or search pages
60
- links.each do |link|
61
- if link
62
- if isResultLink?(link)
63
- siteURLSave(link)
64
- elsif isSearchPageLink?(link)
65
- nextSearchPage("google.com"+link)
66
- end
67
- end
69
+ def navigate_save_results(page)
70
+ # Save result links for page
71
+ result_links = get_links(page) {|html| html.css("h3.r").css("a")}
72
+ result_links.each do |link|
73
+ site_url_save(link)
68
74
  end
69
- end
70
-
71
- # Determines if url is link to search result
72
- def isResultLink?(link)
73
- return (link.include? @op_val) &&
74
- (!link.include? "webcache") &&
75
- (!link.include? @operators.gsub(" ", "+")) &&
76
- (!link.include?("translate.google"))
77
- end
78
75
 
79
- # Determines if URL is link to next search page
80
- def isSearchPageLink?(link)
81
- return (link.include? "&sa=N") && (link.include? "&start=")
76
+ # Go to next page
77
+ next_pages = get_links(page) {|html| html.css("#pnnext")}
78
+ next_pages.each do |link|
79
+ next_search_page("google.com"+link)
80
+ end
82
81
  end
83
-
84
82
 
85
83
  # Parse and save the URLs for search results
86
- def siteURLSave(link)
84
+ def site_url_save(link)
87
85
  @urllist.push(link)
88
86
  end
89
87
 
90
88
  # Process search links and go to next page
91
- def nextSearchPage(link)
89
+ def next_search_page(link)
92
90
  page_index_num = link.split("&start=")[1].split("&sa=N")[0]
93
91
 
94
92
  if page_index_num.to_i == @startindex
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.15
4
+ version: 0.0.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-23 00:00:00.000000000 Z
11
+ date: 2015-12-05 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes Google
14
14
  email: shidash@shidash.com
@@ -16,6 +16,7 @@ executables: []
16
16
  extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
+ - lib/captcha.rb
19
20
  - lib/generalscraper.rb
20
21
  - lib/parse_page.rb
21
22
  homepage: https://github.com/TransparencyToolkit/generalscraper