strigil 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/strigil.rb +33 -1
- data/lib/strigil/comments_parser.rb +20 -9
- data/lib/strigil/strigil_client.rb +51 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 14e5b5afe824abec6ddcae00f102eb785b55c033
|
4
|
+
data.tar.gz: 2674268125891b755c502b7d0321e352c99c66a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2107c4baebc045cf2034bc18aec704065dae8677a880db227cf227d3fc934ba04f2ef035adddee0a1d5cba9b163df83ac391b752135dc261cc4a64d7705f78ca
|
7
|
+
data.tar.gz: '025218138f22d2332c3e89ef1a59c511759a49b3ecabb636e8f49cb650a0e9e342e835b8751beaa0c9422c6ec12ffdd53c779de8a613ed29b3a8f04dc11d993a'
|
data/lib/strigil.rb
CHANGED
@@ -1,8 +1,19 @@
|
|
1
1
|
class Strigil
|
2
2
|
def self.engage(user)
|
3
|
-
|
3
|
+
client_config
|
4
|
+
|
5
|
+
puts "Initializing client..."
|
6
|
+
client = Strigil::StrigilClient.new(
|
7
|
+
username: @username,
|
8
|
+
password: @password,
|
9
|
+
browser: @browser,
|
10
|
+
target: user
|
11
|
+
)
|
12
|
+
|
13
|
+
puts "Initializing comments store..."
|
4
14
|
comments = Strigil::Comments.new
|
5
15
|
|
16
|
+
puts "Beginning scrape. This may take a while."
|
6
17
|
processing = true
|
7
18
|
while processing == true
|
8
19
|
comments.add_comments(client.pull_comments)
|
@@ -22,6 +33,27 @@ class Strigil
|
|
22
33
|
end
|
23
34
|
end
|
24
35
|
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def self.client_config
|
40
|
+
puts "Valid Reddit account details are neccessary to scrape correctly."
|
41
|
+
puts "This information is _not_ sent to any third party - it is simply"
|
42
|
+
puts "used to properly configure how user comments are displayed in"
|
43
|
+
puts "order to scrape them correctly. Feel free to use a throwaway."
|
44
|
+
puts "Username:"
|
45
|
+
print "> "
|
46
|
+
@username = gets.chomp
|
47
|
+
puts "Password:"
|
48
|
+
print "> "
|
49
|
+
@password = gets.chomp
|
50
|
+
puts "Do you have Chrome or Firefox installed?"
|
51
|
+
puts "Type either 'chrome' or 'firefox' without quotes."
|
52
|
+
@browser = gets.chomp
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
class ConfigurationError < StandardError
|
25
57
|
end
|
26
58
|
|
27
59
|
require 'strigil/comment'
|
@@ -1,10 +1,12 @@
|
|
1
1
|
class Strigil::CommentsParser
|
2
2
|
|
3
|
-
def self.parse(
|
3
|
+
def self.parse(client)
|
4
|
+
comments = get_comments(client)
|
5
|
+
|
4
6
|
comments.map do |comment|
|
5
7
|
Strigil::Comment.new(
|
6
|
-
author: comment
|
7
|
-
subreddit: comment
|
8
|
+
author: get_author(comment),
|
9
|
+
subreddit: get_subreddit(comment),
|
8
10
|
permalink: get_permalink(comment),
|
9
11
|
timestamp: get_timestamp(comment),
|
10
12
|
text: get_text(comment)
|
@@ -14,19 +16,28 @@ class Strigil::CommentsParser
|
|
14
16
|
|
15
17
|
private
|
16
18
|
|
19
|
+
def self.get_comments(client)
|
20
|
+
client.find_elements(class: "comment")
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.get_author(comment)
|
24
|
+
comment.attribute("data-author")
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.get_subreddit(comment)
|
28
|
+
comment.attribute("data-subreddit")
|
29
|
+
end
|
30
|
+
|
17
31
|
def self.get_permalink(comment)
|
18
|
-
|
19
|
-
entry.find_element(class: "bylink").attribute(:href)
|
32
|
+
'https://reddit.com/r/' + comment.attribute("data-permalink")
|
20
33
|
end
|
21
34
|
|
22
35
|
def self.get_timestamp(comment)
|
23
|
-
|
24
|
-
entry.find_element(tag_name: "time").attribute(:title)
|
36
|
+
get_entry(comment).find_element(tag_name: "time").attribute(:title)
|
25
37
|
end
|
26
38
|
|
27
39
|
def self.get_text(comment)
|
28
|
-
|
29
|
-
entry.find_element(class: "usertext-body").text
|
40
|
+
get_entry(comment).find_element(class: "usertext-body").text
|
30
41
|
end
|
31
42
|
|
32
43
|
def self.get_entry(comment)
|
@@ -4,14 +4,20 @@ require 'webdrivers'
|
|
4
4
|
class Strigil::StrigilClient
|
5
5
|
attr_reader :client
|
6
6
|
|
7
|
-
def initialize(
|
8
|
-
|
9
|
-
|
7
|
+
def initialize(params)
|
8
|
+
target = params[:target]
|
9
|
+
username = params[:username]
|
10
|
+
password = params[:password]
|
11
|
+
|
12
|
+
@client = Selenium::WebDriver.for(params[:browser].to_sym)
|
13
|
+
|
14
|
+
login_and_make_legacy(username, password)
|
15
|
+
|
16
|
+
client.navigate.to "https://reddit.com/u/#{target}/comments"
|
10
17
|
end
|
11
18
|
|
12
19
|
def pull_comments
|
13
|
-
|
14
|
-
Strigil::CommentsParser.parse(raw_comments)
|
20
|
+
Strigil::CommentsParser.parse(client)
|
15
21
|
end
|
16
22
|
|
17
23
|
def close
|
@@ -26,6 +32,46 @@ class Strigil::StrigilClient
|
|
26
32
|
end
|
27
33
|
end
|
28
34
|
|
35
|
+
private
|
36
|
+
|
37
|
+
def login_and_make_legacy(username, password)
|
38
|
+
login_to_reddit(username, password)
|
39
|
+
set_legacy_preference
|
40
|
+
end
|
41
|
+
|
42
|
+
def login_to_reddit(username, password)
|
43
|
+
client.navigate.to 'https://reddit.com/login'
|
44
|
+
|
45
|
+
login_panel = client.find_element(id: "login-form")
|
46
|
+
|
47
|
+
username_field = login_panel.find_element(id: "user_login")
|
48
|
+
password_field = login_panel.find_element(id: "passwd_login")
|
49
|
+
login_button = login_panel.find_element(tag_name: "button", class: "c-btn")
|
50
|
+
|
51
|
+
username_field.send_keys(username)
|
52
|
+
password_field.send_keys(password)
|
53
|
+
|
54
|
+
login_button.click
|
55
|
+
|
56
|
+
wait = Selenium::WebDriver::Wait.new(timeout: 5)
|
57
|
+
wait.until do
|
58
|
+
client.find_element(id: "header-bottom-right").find_element(link_text: username)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def set_legacy_preference
|
63
|
+
client.navigate.to 'https://reddit.com/prefs/'
|
64
|
+
|
65
|
+
legacy_box = client.find_element(id: "profile_opt_out")
|
66
|
+
|
67
|
+
unless legacy_box.attribute(:checked)
|
68
|
+
legacy_box.click
|
69
|
+
|
70
|
+
save_button = client.find_element(class: "save-preferences")
|
71
|
+
save_button.click
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
29
75
|
end
|
30
76
|
|
31
77
|
class EndOfQueueError < StandardError
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: strigil
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Harry Stebbins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-04-
|
11
|
+
date: 2018-04-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: selenium-webdriver
|