generalscraper 0.0.26 → 0.0.27
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +1 -0
- data/lib/translate_page.rb +75 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fe81408deb27c96a9a67231be5953c8ddf11b23
|
4
|
+
data.tar.gz: 1297c834ccdae5daf1ef1b8664f555f6481995f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b5ae3df4ea3ea229dc7e89e55bb4e56f86e6d2bc93b38e064d38bb772b1a02fad6fe1ec58698e88c97714c269ca4c9bef5a95d8522c2edf4443141d71f6c9fae
|
7
|
+
data.tar.gz: e63d78332f8decb130f1968939752d8464a5eb6be4d5da151403b3b7687e426128e3670f0ee37c64472b5371b1d5d971bd01b9dfaeb1b86409b0d994734d5be0
|
data/lib/generalscraper.rb
CHANGED
(+1 -0; hunk not shown)
data/lib/translate_page.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'requestmanager'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Drives a browser through the Google Translate web page for a list of URLs
# and collects the HTML read back from the resulting iframe for each one.
#
# The browser comes from the +requests+ collaborator, which must respond to
# +get_page+, +get_most_recent_browser+ and +close_all_browsers+.
class TranslatePage
  # Page opened before any URL is submitted.
  TRANSLATE_URL = "https://translate.google.com".freeze

  # Default pause (seconds) before reading a result page.
  PAGE_LOAD_WAIT_SECONDS = 3

  # @param urls [Array<String>] URLs to run through the translate page, in order
  # @param requests [Object] request manager that owns the browser(s)
  def initialize(urls, requests)
    @urls = urls
    @requests = requests
    @output = []
  end

  # Setup browser for translate.
  #
  # Loads the translate page through the request manager and returns the
  # browser handle found at index [1][0] of get_most_recent_browser.
  # NOTE(review): the exact shape of that return value is defined by the
  # request manager, not visible here -- confirm before changing the indices.
  def setup_browser
    @requests.get_page(TRANSLATE_URL)
    @requests.get_most_recent_browser[1][0]
  end

  # First request: types the URL into the element with id "source", picks a
  # language and submits the form.
  #
  # @param url [String] URL to submit
  # @param browser [Object] browser handle returned by #setup_browser
  def first_request(url, browser)
    # Enter URL into translate form
    translate_form = browser.find_element(id: "source")
    translate_form.send_keys(url)

    # Click the button to translate to a particular language
    # (the last element on the page whose value attribute is 'es').
    language_button = browser.find_elements(:xpath, "//*[@value='es']").last
    language_button.click

    # Press Translate button, then switch back to original
    browser.find_element(id: "gt-submit").click
  end

  # Next request: reuses the form named "q" for every URL after the first.
  #
  # @param url [String] URL to submit
  # @param browser [Object] browser handle returned by #setup_browser
  def nth_request(url, browser)
    # get_iframe_html leaves the driver focused inside frame 0, so step back
    # out to the top-level document before looking for the form.
    browser.switch_to.default_content
    form_element = browser.find_element(name: "q")
    form_element.clear
    form_element.send_keys(url)
    form_element.submit
  end

  # Translate the pages.
  #
  # Submits every URL (the first via #first_request, the rest via
  # #nth_request) and appends {url:, html:} to the accumulated output.
  # The request manager's browsers are closed even when a page interaction
  # raises, so a failure part-way through does not leave a browser running.
  #
  # Results accumulate in the instance, so calling this twice on the same
  # object returns the results of both runs.
  #
  # @return [Array<Hash>] one {url:, html:} hash per processed URL
  def translate
    browser = setup_browser

    begin
      # Go through each link
      @urls.each_with_index do |url, index|
        # Run translate on each page
        if index.zero?
          first_request(url, browser)
        else
          nth_request(url, browser)
        end

        # Get html
        @output.push({url: url, html: get_iframe_html(browser)})
      end
    ensure
      # Clean up
      @requests.close_all_browsers
    end

    @output
  end

  # Get iframe: waits, clicks the element with id "anno2", switches into
  # frame 0 and returns the innerHTML of the element with class "os-linux".
  # NOTE(review): "anno2" is presumably the control that switches the view
  # back to the original text, and "os-linux" looks platform-specific --
  # confirm both against the live page.
  #
  # @param browser [Object] browser handle returned by #setup_browser
  # @param wait [Numeric] seconds to pause before touching the page
  # @return [String] innerHTML of the matched element inside the iframe
  def get_iframe_html(browser, wait: PAGE_LOAD_WAIT_SECONDS)
    sleep(wait)
    browser.find_element(id: "anno2").click

    # Get HTML inside the iframe
    browser.switch_to.frame(0)
    browser.find_element(class: "os-linux").attribute("innerHTML")
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.26
|
4
|
+
version: 0.0.27
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-18 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes Google
|
14
14
|
email: shidash@shidash.com
|
@@ -19,6 +19,7 @@ files:
|
|
19
19
|
- lib/captcha.rb
|
20
20
|
- lib/generalscraper.rb
|
21
21
|
- lib/parse_page.rb
|
22
|
+
- lib/translate_page.rb
|
22
23
|
homepage: https://github.com/TransparencyToolkit/generalscraper
|
23
24
|
licenses:
|
24
25
|
- GPL
|