hermaeus 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -6
- data/data/config.toml +4 -0
- data/lib/hermaeus.rb +12 -3
- data/lib/hermaeus/client.rb +46 -58
- data/lib/hermaeus/config.rb +52 -13
- data/lib/hermaeus/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ca26f9776acb76bf0097fa0c43b18d1563890084
|
4
|
+
data.tar.gz: fb9835383d6e290d90cf8cfa9191033031e469de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6111c2eee01f5c4122aa1542510aa8610a2a4542a053c470a7aa333be282dd7616c7f40f3121700a2b8ddb997adbf1cd368a0998641b15bd900603247159d4ed
|
7
|
+
data.tar.gz: f81f1158506b2eed59c30a29258cfafefdcf41b3b1f7fdca584a814c4fdf7d87b0d9b81159c80c92f8c9afdc83405f846fdd4cbd0ee97ec82e411be8572453f7
|
data/CHANGELOG.md
CHANGED
@@ -2,18 +2,29 @@
|
|
2
2
|
|
3
3
|
## v1
|
4
4
|
|
5
|
-
### v1.0.2
|
5
|
+
### v1.1
|
6
|
+
|
7
|
+
Move the index page out of a magic string and into the configuration file.
|
8
|
+
|
9
|
+
Update the configuration handler to account for the new information, and provide
|
10
|
+
some more helpful error messages on validation failure.
|
11
|
+
|
12
|
+
Update the Changelog structure slightly.
|
13
|
+
|
14
|
+
### v1.0
|
15
|
+
|
16
|
+
#### v1.0.2
|
6
17
|
|
7
18
|
Fix a File.open call; it was accidentally being given a string permission set
|
8
19
|
instead of numeric.
|
9
20
|
|
10
|
-
|
21
|
+
#### v1.0.1
|
11
22
|
|
12
23
|
Deployment to a separate machine brought some hidden bugs to light. Fixed a
|
13
24
|
syntax error in the initialization routines and added documentation about fresh
|
14
25
|
installations to the README.
|
15
26
|
|
16
|
-
|
27
|
+
#### v1.0.0
|
17
28
|
|
18
29
|
Added a storage backend (`Apocryphon` and `Archivist` classes) capable of
|
19
30
|
formatting the retrieved text and storing it on disk.
|
@@ -21,16 +32,16 @@ formatting the retrieved text and storing it on disk.
|
|
21
32
|
`mora` is confirmed to work in the wild, and so Hermaeus is ready for a 1.0
|
22
33
|
release.
|
23
34
|
|
24
|
-
|
35
|
+
### v0
|
25
36
|
|
26
37
|
Development versions used only for experimentation.
|
27
38
|
|
28
|
-
|
39
|
+
#### v0.2.0
|
29
40
|
|
30
41
|
Completed the ability to retrieve texts from reddit and process them enough for
|
31
42
|
demonstration purposes.
|
32
43
|
|
33
|
-
|
44
|
+
#### v0.1.0
|
34
45
|
|
35
46
|
Initial version -- Gained the ability to connect to reddit and retrieve basic
|
36
47
|
information.
|
data/data/config.toml
CHANGED
data/lib/hermaeus.rb
CHANGED
@@ -18,8 +18,17 @@ module Hermaeus
|
|
18
18
|
def self.init
|
19
19
|
FileUtils.mkdir_p(Config::DIR)
|
20
20
|
if File.exist? Config::FILE
|
21
|
-
|
22
|
-
|
21
|
+
Config.load
|
22
|
+
begin
|
23
|
+
Config.validate!
|
24
|
+
rescue ConfigurationError => e
|
25
|
+
puts <<-EOS
|
26
|
+
#{e.message}
|
27
|
+
|
28
|
+
Edit your configuration file (#{File.join Config::DIR, "config.toml"}) to \
|
29
|
+
continue.
|
30
|
+
EOS
|
31
|
+
end
|
23
32
|
else
|
24
33
|
File.open Config::FILE, "w+" do |file|
|
25
34
|
File.open File.expand_path(Config::SOURCE), "r", 0600 do |cfg|
|
@@ -35,7 +44,7 @@ for Hermaeus to function.
|
|
35
44
|
|
36
45
|
# Public: Connects Hermaeus to reddit.
|
37
46
|
def self.connect
|
38
|
-
@client = Client.new
|
47
|
+
@client = Client.new
|
39
48
|
end
|
40
49
|
|
41
50
|
# Public: Downloads Apocrypha posts.
|
data/lib/hermaeus/client.rb
CHANGED
@@ -4,6 +4,8 @@
|
|
4
4
|
redd
|
5
5
|
].each(&method(:require))
|
6
6
|
|
7
|
+
include Enumerable
|
8
|
+
|
7
9
|
require "hermaeus/config"
|
8
10
|
require "hermaeus/version"
|
9
11
|
|
@@ -13,13 +15,10 @@ module Hermaeus
|
|
13
15
|
class Client
|
14
16
|
USER_AGENT = "Redd/Ruby:Hermaeus:#{Hermaeus::VERSION} (by /u/myrrlyn)"
|
15
17
|
# Public: Connects the Hermaeus::Client to reddit.
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
def initialize client
|
21
|
-
Config.validate client: client
|
22
|
-
@client = Redd.it(client.delete(:type).to_sym, *client.values, user_agent: USER_AGENT)
|
18
|
+
def initialize
|
19
|
+
Config.validate!
|
20
|
+
cfg = Config.info[:client]
|
21
|
+
@client = Redd.it(cfg.delete(:type).to_sym, *cfg.values, user_agent: USER_AGENT)
|
23
22
|
@client.authorize!
|
24
23
|
@html_filter = HTMLEntities.new
|
25
24
|
end
|
@@ -28,7 +27,7 @@ module Hermaeus
|
|
28
27
|
#
|
29
28
|
# Wraps Client#scrape_index; see it for documentation.
|
30
29
|
def get_global_listing **opts
|
31
|
-
scrape_index
|
30
|
+
scrape_index Config.info[:index][:path], opts
|
32
31
|
end
|
33
32
|
|
34
33
|
# Public: Scrapes a Weekly Community Thread patch index.
|
@@ -46,6 +45,8 @@ module Hermaeus
|
|
46
45
|
ids.map! do |id|
|
47
46
|
"t3_#{id}" unless id.match /^t3_/
|
48
47
|
end
|
48
|
+
# TODO: Ensure that this is safe (only query <= 100 IDs at a time), and
|
49
|
+
# call the scraper multiple times and reassemble output if necessary.
|
49
50
|
query = "/by_id/#{ids.join(",")}"
|
50
51
|
scrape_index query, opts
|
51
52
|
end
|
@@ -62,6 +63,7 @@ module Hermaeus
|
|
62
63
|
# Returns a String Array containing the reddit fullnames harvested from the
|
63
64
|
# input list. Input elements that do not match are stripped.
|
64
65
|
def get_fullnames data, **opts
|
66
|
+
# TODO: Move this regex to the configuration file.
|
65
67
|
regex = opts[:regex] || %r(/r/.+/(comments/)?(?<id>[0-9a-z]+)/.+)
|
66
68
|
data.map do |item|
|
67
69
|
m = item.match regex
|
@@ -89,25 +91,11 @@ module Hermaeus
|
|
89
91
|
ret = []
|
90
92
|
# reddit has finite limits on acceptable query sizes. Split the list into
|
91
93
|
# manageable portions
|
92
|
-
fullnames.
|
94
|
+
fullnames.each_slice(100).each do |chunk|
|
93
95
|
# Assemble the list of reddit objects being queried
|
94
|
-
query = chunk.join(",")
|
95
|
-
|
96
|
-
|
97
|
-
if response.success?
|
98
|
-
payload = response.body
|
99
|
-
# The payload should be a Listing even for a single-item query; the
|
100
|
-
# :children array will just have one element.
|
101
|
-
if payload[:kind] == "Listing"
|
102
|
-
payload[:data][:children].each do |item|
|
103
|
-
yield item[:data]
|
104
|
-
end
|
105
|
-
# else
|
106
|
-
end
|
107
|
-
ret << payload
|
108
|
-
end
|
109
|
-
# Keep the rate limiter happy
|
110
|
-
sleep 1
|
96
|
+
query = "/by_id/#{chunk.join(",")}.json"
|
97
|
+
response = scrape_posts query, &block
|
98
|
+
ret << response.body
|
111
99
|
end
|
112
100
|
ret
|
113
101
|
end
|
@@ -175,40 +163,40 @@ module Hermaeus
|
|
175
163
|
end
|
176
164
|
.flatten
|
177
165
|
end
|
178
|
-
end
|
179
|
-
end
|
180
166
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
167
|
+
# Internal: Provides the actual functionality for collecting posts.
|
168
|
+
#
|
169
|
+
# query - The reddit API endpoint or path being queried.
|
170
|
+
# opts - Options for the reddit API call
|
171
|
+
# block - This method yields each post fetched to its block.
|
172
|
+
# tries - hidden parameter used to prevent infinite stalling on rate limits.
|
173
|
+
#
|
174
|
+
# Returns reddit's response to the query.
|
175
|
+
def scrape_posts query, tries = 0, **opts, &block
|
176
|
+
begin
|
177
|
+
# Ask reddit to procure our items
|
178
|
+
response = @client.get(query, opts)
|
179
|
+
if response.success?
|
180
|
+
payload = response.body
|
181
|
+
# The payload should be a Listing even for a single-item query; the
|
182
|
+
# :children array will just have one element.
|
183
|
+
if payload[:kind] == "Listing"
|
184
|
+
payload[:data][:children].each do |item|
|
185
|
+
yield item[:data]
|
186
|
+
end
|
187
|
+
end
|
188
|
+
return response
|
189
|
+
end
|
190
|
+
# If at first you don't succeed...
|
191
|
+
rescue Redd::Error::RateLimited => e
|
192
|
+
sleep e.time + 1
|
193
|
+
# Try try again.
|
194
|
+
if tries < 3
|
195
|
+
scrape_posts query, tries + 1
|
196
|
+
else
|
197
|
+
raise RuntimeError, "reddit rate limit will not unlock"
|
198
|
+
end
|
210
199
|
end
|
211
|
-
ret
|
212
200
|
end
|
213
201
|
end
|
214
202
|
end
|
data/lib/hermaeus/config.rb
CHANGED
@@ -14,28 +14,34 @@ module Hermaeus
|
|
14
14
|
# List of allowed types a reddit client can take
|
15
15
|
ALLOWED_TYPES = %w[script web userless installed]
|
16
16
|
|
17
|
+
# Public: Accessor for the loaded and parsed information.
|
18
|
+
#
|
19
|
+
# Returns nil if the config file has yet to be processed.
|
20
|
+
def self.info
|
21
|
+
@info
|
22
|
+
end
|
23
|
+
|
17
24
|
# Public: Load a configuration file into memory
|
18
25
|
#
|
19
26
|
# Returns the configuration file represented as a Hash with Symbol keys
|
20
27
|
def self.load
|
21
|
-
Tomlrb.load_file FILE, symbolize_keys: true
|
28
|
+
@info = Tomlrb.load_file FILE, symbolize_keys: true
|
22
29
|
end
|
23
30
|
|
24
31
|
# Public: Performs validation checks on a configuration structure
|
25
32
|
#
|
26
|
-
# cfg - A Hash with Symbol keys to check for validity
|
27
|
-
#
|
28
33
|
# Returns true if the configuration argument is valid
|
29
34
|
#
|
30
35
|
# Raises a ConfigurationError if the configuration is invalid, with an
|
31
36
|
# error message describing the failure.
|
32
|
-
def self.validate
|
33
|
-
|
34
|
-
|
37
|
+
def self.validate!
|
38
|
+
# Validate the [client] section.
|
39
|
+
raise ConfigurationError.new(<<-EOS) unless @info.has_key? :client
|
35
40
|
Hermaeus’ configuration file must contain a [client] section.
|
36
|
-
|
37
|
-
|
38
|
-
|
41
|
+
EOS
|
42
|
+
|
43
|
+
# Validate the [client] section’s type field.
|
44
|
+
unless @info[:client].has_key?(:type) && ALLOWED_TYPES.include?(@info[:client][:type])
|
39
45
|
raise ConfigurationError.new <<-EOS
|
40
46
|
Hermaeus’ [client] section must include a type key whose value is one of:
|
41
47
|
#{ALLOWED_TYPES.join(", ")}.
|
@@ -44,7 +50,9 @@ Hermaeus’ [client] section must include a type key whose value is one of:
|
|
44
50
|
type = "one of the listed types"
|
45
51
|
EOS
|
46
52
|
end
|
47
|
-
|
53
|
+
|
54
|
+
# Validate the [client] section’s id and secret fields.
|
55
|
+
unless @info[:client].has_key?(:id) && @info[:client].has_key?(:secret)
|
48
56
|
raise ConfigurationError.new <<-EOS
|
49
57
|
Hermaeus’ [client] section must include keys for the ID and secret provided by
|
50
58
|
reddit for your application.
|
@@ -54,8 +62,10 @@ id = "an ID from reddit"
|
|
54
62
|
secret = "a secret from reddit"
|
55
63
|
EOS
|
56
64
|
end
|
57
|
-
|
58
|
-
|
65
|
+
|
66
|
+
# Validate the [client] section’s username and password fields, if needed.
|
67
|
+
if @info[:client][:type] == "script"
|
68
|
+
client = @info[:client]
|
59
69
|
unless client.has_key?(:username) && client.has_key?(:password)
|
60
70
|
raise ConfigurationError.new <<-EOS
|
61
71
|
When configured for `type = "script"`, Hermaeus’ [client] section must include
|
@@ -67,7 +77,36 @@ password = "hunter2"
|
|
67
77
|
EOS
|
68
78
|
end
|
69
79
|
end
|
70
|
-
|
80
|
+
|
81
|
+
# Validate the [archive] section.
|
82
|
+
raise ConfigurationError.new(<<-EOS) unless @info.has_key? :archive
|
83
|
+
Hermaeus’ configuration file must include an [archive] section to govern the
|
84
|
+
storage of downloaded posts.
|
85
|
+
EOS
|
86
|
+
|
87
|
+
raise ConfigurationError.new(<<-EOS) unless @info[:archive].has_key? :path
|
88
|
+
Hermaeus’ [archive] section must include a path field containing a relative or
|
89
|
+
absolute path in which to store the downloaded posts.
|
90
|
+
|
91
|
+
[archive]
|
92
|
+
path = "./archive"
|
93
|
+
# path = "/tmp/teslore/archive"
|
94
|
+
EOS
|
95
|
+
|
96
|
+
# Validate the [index] section.
|
97
|
+
raise ConfigurationError.new(<<-EOS) unless @info.has_key? :index
|
98
|
+
Hermaeus’ configuration file must include an [index] section to govern the
|
99
|
+
processing of the subreddit’s index page.
|
100
|
+
EOS
|
101
|
+
|
102
|
+
raise ConfigurationError.new(<<-EOS) unless @info[:index].has_key? :path
|
103
|
+
Hermaeus’ [index] section must include a path field containing the reddit page
|
104
|
+
at which the index resides.
|
105
|
+
|
106
|
+
[index]
|
107
|
+
path = "/r/teslore/wiki/archive"
|
108
|
+
EOS
|
109
|
+
true
|
71
110
|
end
|
72
111
|
end
|
73
112
|
end
|
data/lib/hermaeus/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hermaeus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- myrrlyn
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|