generalscraper 0.0.15 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/captcha.rb +56 -0
- data/lib/generalscraper.rb +33 -35
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32261924b8307b97e84ae2ba6c884ac6d3caa93c
|
4
|
+
data.tar.gz: f888607279ef76b7f72fabe9726e602c3bcafad7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 46ef701e1b891cc9d75517d90710c458ef377902454827830633bedfc7c117769255d694a175a20085c43a9b969d2d0375b8442132bca0447cc45613cfebf313
|
7
|
+
data.tar.gz: 4c5d2c254af2d02616c5441b8213b9ebbdf82dd1f82e0b4c91edbca0cf9f16e4b7c732e419fbbe7d0655b08c783b9070c87ce97eb0c83f03ee92f7043a0efceb
|
data/lib/captcha.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'rmagick'
|
2
|
+
require 'imgurr'
|
3
|
+
require 'curb'
|
4
|
+
require 'two_captcha'
|
5
|
+
include Magick
|
6
|
+
|
7
|
+
class Captcha
|
8
|
+
def initialize(requests, solver_details)
|
9
|
+
@requests = requests
|
10
|
+
@captcha_key = solver_details[:captcha_key]
|
11
|
+
end
|
12
|
+
|
13
|
+
# Solves the captcha
|
14
|
+
def solve
|
15
|
+
take_screenshot
|
16
|
+
crop_screenshot
|
17
|
+
@captcha_solution = get_captcha_solved
|
18
|
+
submit_captcha_solution
|
19
|
+
delete_screenshots
|
20
|
+
end
|
21
|
+
|
22
|
+
# Have the captcha solved
|
23
|
+
def get_captcha_solved
|
24
|
+
client = TwoCaptcha.new(@captcha_key)
|
25
|
+
captcha = client.decode!(file: File.open(@time_name+"_cropped.png"))
|
26
|
+
return captcha.text
|
27
|
+
end
|
28
|
+
|
29
|
+
# Submit the captcha solution
|
30
|
+
def submit_captcha_solution
|
31
|
+
browser = @requests.get_most_recent_browser[1][0]
|
32
|
+
element = browser.find_element(id: "captcha")
|
33
|
+
element.send_keys(@captcha_solution)
|
34
|
+
element.submit
|
35
|
+
end
|
36
|
+
|
37
|
+
# Takes a screenshot of captcha in browser
|
38
|
+
def take_screenshot
|
39
|
+
@time_name = Time.now.to_s.gsub(" ", "").gsub("-", "").gsub(":", "").gsub("-", "")
|
40
|
+
@requests.get_most_recent_browser[1][0].save_screenshot(@time_name+".png")
|
41
|
+
end
|
42
|
+
|
43
|
+
# Crops the screenshot to be mostly just the CAPTCHA
|
44
|
+
def crop_screenshot
|
45
|
+
captcha_image = Image.read(@time_name+".png").first
|
46
|
+
width = captcha_image.columns
|
47
|
+
height = captcha_image.rows
|
48
|
+
cropped_captcha = captcha_image.crop(0, 0, width, height/2)
|
49
|
+
cropped_captcha.write(@time_name+"_cropped.png")
|
50
|
+
end
|
51
|
+
|
52
|
+
# Deletes the screenshot images
|
53
|
+
def delete_screenshots
|
54
|
+
File.delete(@time_name+".png", @time_name+"_cropped.png")
|
55
|
+
end
|
56
|
+
end
|
data/lib/generalscraper.rb
CHANGED
@@ -5,15 +5,17 @@ require 'requestmanager'
|
|
5
5
|
require 'pry'
|
6
6
|
|
7
7
|
load 'parse_page.rb'
|
8
|
+
load 'captcha.rb'
|
8
9
|
|
9
10
|
class GeneralScraper
|
10
11
|
include ParsePage
|
11
12
|
|
12
|
-
def initialize(operators, searchterm, requests)
|
13
|
+
def initialize(operators, searchterm, requests, solver_details)
|
13
14
|
@operators = operators
|
14
15
|
@searchterm = searchterm
|
15
16
|
@op_val = @operators.split(" ")[0].split(":")[1]
|
16
17
|
@requests = requests
|
18
|
+
@solver_details = solver_details
|
17
19
|
|
18
20
|
@output = Array.new
|
19
21
|
@urllist = Array.new
|
@@ -29,19 +31,30 @@ class GeneralScraper
|
|
29
31
|
# Check that page with links loaded
|
30
32
|
def check_results(page, *requested_page)
|
31
33
|
if page.include?("To continue, please type the characters below:")
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
34
|
+
# Solve CAPTCHA if enabled
|
35
|
+
if @solver_details
|
36
|
+
c = Captcha.new(@requests, @solver_details)
|
37
|
+
c.solve
|
38
|
+
|
39
|
+
# Proceed as normal
|
40
|
+
sleep(1)
|
41
|
+
check_results(@requests.get_updated_current_page)
|
42
|
+
|
43
|
+
else # Restart and try again if CAPTCHA-solving not enabled
|
44
|
+
@requests.restart_browser
|
45
|
+
check_results(@requests.get_page(requested_page), requested_page)
|
46
|
+
end
|
47
|
+
else # No CAPTCHA found :)
|
48
|
+
navigate_save_results(page)
|
36
49
|
end
|
37
50
|
end
|
38
51
|
|
39
|
-
# Gets the links from the page
|
40
|
-
def
|
52
|
+
# Gets the links from the page that match css selector in block
|
53
|
+
def get_links(page, &block)
|
41
54
|
html = Nokogiri::HTML(page)
|
42
55
|
|
43
56
|
# Get array of links
|
44
|
-
return html
|
57
|
+
return yield(html).inject(Array.new) do |link_arr, al|
|
45
58
|
begin
|
46
59
|
link_arr.push(al["href"])
|
47
60
|
rescue
|
@@ -53,42 +66,27 @@ class GeneralScraper
|
|
53
66
|
end
|
54
67
|
|
55
68
|
# Categorizes the links on results page into results and other search pages
|
56
|
-
def
|
57
|
-
links
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
if link
|
62
|
-
if isResultLink?(link)
|
63
|
-
siteURLSave(link)
|
64
|
-
elsif isSearchPageLink?(link)
|
65
|
-
nextSearchPage("google.com"+link)
|
66
|
-
end
|
67
|
-
end
|
69
|
+
def navigate_save_results(page)
|
70
|
+
# Save result links for page
|
71
|
+
result_links = get_links(page) {|html| html.css("h3.r").css("a")}
|
72
|
+
result_links.each do |link|
|
73
|
+
site_url_save(link)
|
68
74
|
end
|
69
|
-
end
|
70
|
-
|
71
|
-
# Determines if url is link to search result
|
72
|
-
def isResultLink?(link)
|
73
|
-
return (link.include? @op_val) &&
|
74
|
-
(!link.include? "webcache") &&
|
75
|
-
(!link.include? @operators.gsub(" ", "+")) &&
|
76
|
-
(!link.include?("translate.google"))
|
77
|
-
end
|
78
75
|
|
79
|
-
|
80
|
-
|
81
|
-
|
76
|
+
# Go to next page
|
77
|
+
next_pages = get_links(page) {|html| html.css("#pnnext")}
|
78
|
+
next_pages.each do |link|
|
79
|
+
next_search_page("google.com"+link)
|
80
|
+
end
|
82
81
|
end
|
83
|
-
|
84
82
|
|
85
83
|
# Parse and save the URLs for search results
|
86
|
-
def
|
84
|
+
def site_url_save(link)
|
87
85
|
@urllist.push(link)
|
88
86
|
end
|
89
87
|
|
90
88
|
# Process search links and go to next page
|
91
|
-
def
|
89
|
+
def next_search_page(link)
|
92
90
|
page_index_num = link.split("&start=")[1].split("&sa=N")[0]
|
93
91
|
|
94
92
|
if page_index_num.to_i == @startindex
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.15
|
4
|
+
version: 0.0.16
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-05 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes Google
|
14
14
|
email: shidash@shidash.com
|
@@ -16,6 +16,7 @@ executables: []
|
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
18
18
|
files:
|
19
|
+
- lib/captcha.rb
|
19
20
|
- lib/generalscraper.rb
|
20
21
|
- lib/parse_page.rb
|
21
22
|
homepage: https://github.com/TransparencyToolkit/generalscraper
|