strigil 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/strigil.rb +33 -1
- data/lib/strigil/comments_parser.rb +20 -9
- data/lib/strigil/strigil_client.rb +51 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 14e5b5afe824abec6ddcae00f102eb785b55c033
|
4
|
+
data.tar.gz: 2674268125891b755c502b7d0321e352c99c66a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2107c4baebc045cf2034bc18aec704065dae8677a880db227cf227d3fc934ba04f2ef035adddee0a1d5cba9b163df83ac391b752135dc261cc4a64d7705f78ca
|
7
|
+
data.tar.gz: '025218138f22d2332c3e89ef1a59c511759a49b3ecabb636e8f49cb650a0e9e342e835b8751beaa0c9422c6ec12ffdd53c779de8a613ed29b3a8f04dc11d993a'
|
data/lib/strigil.rb
CHANGED
@@ -1,8 +1,19 @@
|
|
1
1
|
class Strigil
|
2
2
|
def self.engage(user)
|
3
|
-
|
3
|
+
client_config
|
4
|
+
|
5
|
+
puts "Initializing client..."
|
6
|
+
client = Strigil::StrigilClient.new(
|
7
|
+
username: @username,
|
8
|
+
password: @password,
|
9
|
+
browser: @browser,
|
10
|
+
target: user
|
11
|
+
)
|
12
|
+
|
13
|
+
puts "Initializing comments store..."
|
4
14
|
comments = Strigil::Comments.new
|
5
15
|
|
16
|
+
puts "Beginning scrape. This may take a while."
|
6
17
|
processing = true
|
7
18
|
while processing == true
|
8
19
|
comments.add_comments(client.pull_comments)
|
@@ -22,6 +33,27 @@ class Strigil
|
|
22
33
|
end
|
23
34
|
end
|
24
35
|
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def self.client_config
|
40
|
+
puts "Valid Reddit account details are neccessary to scrape correctly."
|
41
|
+
puts "This information is _not_ sent to any third party - it is simply"
|
42
|
+
puts "used to properly configure how user comments are displayed in"
|
43
|
+
puts "order to scrape them correctly. Feel free to use a throwaway."
|
44
|
+
puts "Username:"
|
45
|
+
print "> "
|
46
|
+
@username = gets.chomp
|
47
|
+
puts "Password:"
|
48
|
+
print "> "
|
49
|
+
@password = gets.chomp
|
50
|
+
puts "Do you have Chrome or Firefox installed?"
|
51
|
+
puts "Type either 'chrome' or 'firefox' without quotes."
|
52
|
+
@browser = gets.chomp
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
class ConfigurationError < StandardError
|
25
57
|
end
|
26
58
|
|
27
59
|
require 'strigil/comment'
|
data/lib/strigil/comments_parser.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
class Strigil::CommentsParser
|
2
2
|
|
3
|
-
def self.parse(
|
3
|
+
def self.parse(client)
|
4
|
+
comments = get_comments(client)
|
5
|
+
|
4
6
|
comments.map do |comment|
|
5
7
|
Strigil::Comment.new(
|
6
|
-
author: comment
|
7
|
-
subreddit: comment
|
8
|
+
author: get_author(comment),
|
9
|
+
subreddit: get_subreddit(comment),
|
8
10
|
permalink: get_permalink(comment),
|
9
11
|
timestamp: get_timestamp(comment),
|
10
12
|
text: get_text(comment)
|
@@ -14,19 +16,28 @@ class Strigil::CommentsParser
|
|
14
16
|
|
15
17
|
private
|
16
18
|
|
19
|
+
def self.get_comments(client)
|
20
|
+
client.find_elements(class: "comment")
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.get_author(comment)
|
24
|
+
comment.attribute("data-author")
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.get_subreddit(comment)
|
28
|
+
comment.attribute("data-subreddit")
|
29
|
+
end
|
30
|
+
|
17
31
|
def self.get_permalink(comment)
|
18
|
-
|
19
|
-
entry.find_element(class: "bylink").attribute(:href)
|
32
|
+
'https://reddit.com/r/' + comment.attribute("data-permalink")
|
20
33
|
end
|
21
34
|
|
22
35
|
def self.get_timestamp(comment)
|
23
|
-
|
24
|
-
entry.find_element(tag_name: "time").attribute(:title)
|
36
|
+
get_entry(comment).find_element(tag_name: "time").attribute(:title)
|
25
37
|
end
|
26
38
|
|
27
39
|
def self.get_text(comment)
|
28
|
-
|
29
|
-
entry.find_element(class: "usertext-body").text
|
40
|
+
get_entry(comment).find_element(class: "usertext-body").text
|
30
41
|
end
|
31
42
|
|
32
43
|
def self.get_entry(comment)
|
data/lib/strigil/strigil_client.rb
CHANGED
@@ -4,14 +4,20 @@ require 'webdrivers'
|
|
4
4
|
class Strigil::StrigilClient
|
5
5
|
attr_reader :client
|
6
6
|
|
7
|
-
def initialize(
|
8
|
-
|
9
|
-
|
7
|
+
def initialize(params)
|
8
|
+
target = params[:target]
|
9
|
+
username = params[:username]
|
10
|
+
password = params[:password]
|
11
|
+
|
12
|
+
@client = Selenium::WebDriver.for(params[:browser].to_sym)
|
13
|
+
|
14
|
+
login_and_make_legacy(username, password)
|
15
|
+
|
16
|
+
client.navigate.to "https://reddit.com/u/#{target}/comments"
|
10
17
|
end
|
11
18
|
|
12
19
|
def pull_comments
|
13
|
-
|
14
|
-
Strigil::CommentsParser.parse(raw_comments)
|
20
|
+
Strigil::CommentsParser.parse(client)
|
15
21
|
end
|
16
22
|
|
17
23
|
def close
|
@@ -26,6 +32,46 @@ class Strigil::StrigilClient
|
|
26
32
|
end
|
27
33
|
end
|
28
34
|
|
35
|
+
private
|
36
|
+
|
37
|
+
def login_and_make_legacy(username, password)
|
38
|
+
login_to_reddit(username, password)
|
39
|
+
set_legacy_preference
|
40
|
+
end
|
41
|
+
|
42
|
+
def login_to_reddit(username, password)
|
43
|
+
client.navigate.to 'https://reddit.com/login'
|
44
|
+
|
45
|
+
login_panel = client.find_element(id: "login-form")
|
46
|
+
|
47
|
+
username_field = login_panel.find_element(id: "user_login")
|
48
|
+
password_field = login_panel.find_element(id: "passwd_login")
|
49
|
+
login_button = login_panel.find_element(tag_name: "button", class: "c-btn")
|
50
|
+
|
51
|
+
username_field.send_keys(username)
|
52
|
+
password_field.send_keys(password)
|
53
|
+
|
54
|
+
login_button.click
|
55
|
+
|
56
|
+
wait = Selenium::WebDriver::Wait.new(timeout: 5)
|
57
|
+
wait.until do
|
58
|
+
client.find_element(id: "header-bottom-right").find_element(link_text: username)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def set_legacy_preference
|
63
|
+
client.navigate.to 'https://reddit.com/prefs/'
|
64
|
+
|
65
|
+
legacy_box = client.find_element(id: "profile_opt_out")
|
66
|
+
|
67
|
+
unless legacy_box.attribute(:checked)
|
68
|
+
legacy_box.click
|
69
|
+
|
70
|
+
save_button = client.find_element(class: "save-preferences")
|
71
|
+
save_button.click
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
29
75
|
end
|
30
76
|
|
31
77
|
class EndOfQueueError < StandardError
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: strigil
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Harry Stebbins
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-04-
|
11
|
+
date: 2018-04-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: selenium-webdriver
|