generalscraper 0.0.26 → 0.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +1 -0
- data/lib/translate_page.rb +75 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fe81408deb27c96a9a67231be5953c8ddf11b23
|
4
|
+
data.tar.gz: 1297c834ccdae5daf1ef1b8664f555f6481995f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b5ae3df4ea3ea229dc7e89e55bb4e56f86e6d2bc93b38e064d38bb772b1a02fad6fe1ec58698e88c97714c269ca4c9bef5a95d8522c2edf4443141d71f6c9fae
|
7
|
+
data.tar.gz: e63d78332f8decb130f1968939752d8464a5eb6be4d5da151403b3b7687e426128e3670f0ee37c64472b5371b1d5d971bd01b9dfaeb1b86409b0d994734d5be0
|
data/lib/generalscraper.rb
CHANGED
data/lib/translate_page.rb
CHANGED
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'requestmanager'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Drives a browser through the Google Translate web page and collects the
# HTML shown for each URL in a list.
#
# The browser is obtained from the +requests+ object passed to the
# constructor; this class only needs that object to respond to +get_page+,
# +get_most_recent_browser+ and +close_all_browsers+.
class TranslatePage
  # Seconds to pause before reading the result frame.
  PAGE_LOAD_WAIT = 3

  # @param urls [Array<String>] URLs to submit, in order
  # @param requests [Object] browser/request manager (presumably a
  #   RequestManager from the 'requestmanager' gem -- confirm against caller)
  def initialize(urls, requests)
    @urls = urls
    @requests = requests
    @output = []
  end

  # Setup browser for translate.
  #
  # Loads the translate page and returns the browser handle found at
  # index [1][0] of +get_most_recent_browser+'s return value.
  def setup_browser
    @requests.get_page("https://translate.google.com")
    @requests.get_most_recent_browser[1][0]
  end

  # First request: types the URL into the translate form, picks the target
  # language and submits.
  #
  # @param url [String] page to translate
  # @param browser [Object] browser handle returned by #setup_browser
  def first_request(url, browser)
    # Enter URL into translate form
    translate_form = browser.find_element(id: "source")
    translate_form.send_keys(url)

    # Click the button to translate to a particular language
    # (the last element whose value attribute is 'es')
    click_button = browser.find_elements(:xpath, "//*[@value='es']").last
    click_button.click

    # Press Translate button
    browser.find_element(id: "gt-submit").click
  end

  # Next request: reuses the form named "q" on the already-open result page.
  #
  # @param url [String] page to translate
  # @param browser [Object] browser handle returned by #setup_browser
  def nth_request(url, browser)
    # Leave the frame entered by #get_iframe_html on the previous iteration
    browser.switch_to.default_content
    form_element = browser.find_element(name: "q")
    form_element.clear
    form_element.send_keys(url)
    form_element.submit
  end

  # Translate the pages.
  #
  # Results are appended to the instance's output array, so calling this
  # method more than once on the same instance accumulates entries.
  #
  # @return [Array<Hash>] one +{url:, html:}+ hash per URL
  def translate
    browser = setup_browser

    # Go through each link; only the first one goes through the initial form
    @urls.each_with_index do |url, index|
      if index.zero?
        first_request(url, browser)
      else
        nth_request(url, browser)
      end

      # Get html
      @output.push({url: url, html: get_iframe_html(browser)})
    end

    @output
  ensure
    # Clean up even when a browser call raised, so browsers are not leaked
    @requests.close_all_browsers
  end

  # Get iframe: waits, clicks the element with id "anno2", then reads the
  # result from the first frame.
  #
  # @param browser [Object] browser handle returned by #setup_browser
  # @return [String] innerHTML of the element with class "os-linux"
  def get_iframe_html(browser)
    sleep(PAGE_LOAD_WAIT)
    # NOTE(review): "anno2" presumably switches the view back to the
    # original page -- confirm against the live page markup.
    browser.find_element(id: "anno2").click

    # Get HTML inside the iframe (frame index 0)
    browser.switch_to.frame(0)
    # NOTE(review): the "os-linux" class looks platform-specific -- confirm
    # this selector matches when the browser runs on another OS.
    browser.find_element(class: "os-linux").attribute("innerHTML")
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.26
|
4
|
+
version: 0.0.27
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-18 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes Google
|
14
14
|
email: shidash@shidash.com
|
@@ -19,6 +19,7 @@ files:
|
|
19
19
|
- lib/captcha.rb
|
20
20
|
- lib/generalscraper.rb
|
21
21
|
- lib/parse_page.rb
|
22
|
+
- lib/translate_page.rb
|
22
23
|
homepage: https://github.com/TransparencyToolkit/generalscraper
|
23
24
|
licenses:
|
24
25
|
- GPL
|