hermaeus 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -6
- data/data/config.toml +4 -0
- data/lib/hermaeus.rb +12 -3
- data/lib/hermaeus/client.rb +46 -58
- data/lib/hermaeus/config.rb +52 -13
- data/lib/hermaeus/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ca26f9776acb76bf0097fa0c43b18d1563890084
|
4
|
+
data.tar.gz: fb9835383d6e290d90cf8cfa9191033031e469de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6111c2eee01f5c4122aa1542510aa8610a2a4542a053c470a7aa333be282dd7616c7f40f3121700a2b8ddb997adbf1cd368a0998641b15bd900603247159d4ed
|
7
|
+
data.tar.gz: f81f1158506b2eed59c30a29258cfafefdcf41b3b1f7fdca584a814c4fdf7d87b0d9b81159c80c92f8c9afdc83405f846fdd4cbd0ee97ec82e411be8572453f7
|
data/CHANGELOG.md
CHANGED
@@ -2,18 +2,29 @@
|
|
2
2
|
|
3
3
|
## v1
|
4
4
|
|
5
|
-
### v1.
|
5
|
+
### v1.1
|
6
|
+
|
7
|
+
Move the index page out of a magic string and into the configuration file.
|
8
|
+
|
9
|
+
Update the configuration handler to account for the new information, and provide
|
10
|
+
some more helpful error messages on validation failure.
|
11
|
+
|
12
|
+
Update the Changelog structure slightly.
|
13
|
+
|
14
|
+
### v1.0
|
15
|
+
|
16
|
+
#### v1.0.2
|
6
17
|
|
7
18
|
Fix a File.open call; it was accidentally being given a string permission set
|
8
19
|
instead of numeric.
|
9
20
|
|
10
|
-
|
21
|
+
#### v1.0.1
|
11
22
|
|
12
23
|
Deployment to a separate machine brought some hidden bugs to light. Fixed a
|
13
24
|
syntax error in the initialization routines and added documentation about fresh
|
14
25
|
installations to the README.
|
15
26
|
|
16
|
-
|
27
|
+
#### v1.0.0
|
17
28
|
|
18
29
|
Added a storage backend (`Apocryphon` and `Archivist` classes) capable of
|
19
30
|
formatting the retrieved text and storing it on disk.
|
@@ -21,16 +32,16 @@ formatting the retrieved text and storing it on disk.
|
|
21
32
|
`mora` is confirmed to work in the wild, and so Hermaeus is ready for a 1.0
|
22
33
|
release.
|
23
34
|
|
24
|
-
|
35
|
+
### v0
|
25
36
|
|
26
37
|
Development versions used only for experimentation.
|
27
38
|
|
28
|
-
|
39
|
+
#### v0.2.0
|
29
40
|
|
30
41
|
Completed the ability to retrieve texts from reddit and process them enough for
|
31
42
|
demonstration purposes.
|
32
43
|
|
33
|
-
|
44
|
+
#### v0.1.0
|
34
45
|
|
35
46
|
Initial version -- Gained the ability to connect to reddit and retrieve basic
|
36
47
|
information.
|
data/data/config.toml
CHANGED
data/lib/hermaeus.rb
CHANGED
@@ -18,8 +18,17 @@ module Hermaeus
|
|
18
18
|
def self.init
|
19
19
|
FileUtils.mkdir_p(Config::DIR)
|
20
20
|
if File.exist? Config::FILE
|
21
|
-
|
22
|
-
|
21
|
+
Config.load
|
22
|
+
begin
|
23
|
+
Config.validate!
|
24
|
+
rescue ConfigurationError => e
|
25
|
+
puts <<-EOS
|
26
|
+
#{e.message}
|
27
|
+
|
28
|
+
Edit your configuration file (#{File.join Config::DIR, "config.toml"}) to \
|
29
|
+
continue.
|
30
|
+
EOS
|
31
|
+
end
|
23
32
|
else
|
24
33
|
File.open Config::FILE, "w+" do |file|
|
25
34
|
File.open File.expand_path(Config::SOURCE), "r", 0600 do |cfg|
|
@@ -35,7 +44,7 @@ for Hermaeus to function.
|
|
35
44
|
|
36
45
|
# Public: Connects Hermaeus to reddit.
|
37
46
|
def self.connect
|
38
|
-
@client = Client.new
|
47
|
+
@client = Client.new
|
39
48
|
end
|
40
49
|
|
41
50
|
# Public: Downloads Apocrypha posts.
|
data/lib/hermaeus/client.rb
CHANGED
@@ -4,6 +4,8 @@
|
|
4
4
|
redd
|
5
5
|
].each(&method(:require))
|
6
6
|
|
7
|
+
include Enumerable
|
8
|
+
|
7
9
|
require "hermaeus/config"
|
8
10
|
require "hermaeus/version"
|
9
11
|
|
@@ -13,13 +15,10 @@ module Hermaeus
|
|
13
15
|
class Client
|
14
16
|
USER_AGENT = "Redd/Ruby:Hermaeus:#{Hermaeus::VERSION} (by /u/myrrlyn)"
|
15
17
|
# Public: Connects the Hermaeus::Client to reddit.
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
def initialize client
|
21
|
-
Config.validate client: client
|
22
|
-
@client = Redd.it(client.delete(:type).to_sym, *client.values, user_agent: USER_AGENT)
|
18
|
+
def initialize
|
19
|
+
Config.validate!
|
20
|
+
cfg = Config.info[:client]
|
21
|
+
@client = Redd.it(cfg.delete(:type).to_sym, *cfg.values, user_agent: USER_AGENT)
|
23
22
|
@client.authorize!
|
24
23
|
@html_filter = HTMLEntities.new
|
25
24
|
end
|
@@ -28,7 +27,7 @@ module Hermaeus
|
|
28
27
|
#
|
29
28
|
# Wraps Client#scrape_index; see it for documentation.
|
30
29
|
def get_global_listing **opts
|
31
|
-
scrape_index
|
30
|
+
scrape_index Config.info[:index][:path], opts
|
32
31
|
end
|
33
32
|
|
34
33
|
# Public: Scrapes a Weekly Community Thread patch index.
|
@@ -46,6 +45,8 @@ module Hermaeus
|
|
46
45
|
ids.map! do |id|
|
47
46
|
"t3_#{id}" unless id.match /^t3_/
|
48
47
|
end
|
48
|
+
# TODO: Ensure that this is safe (only query <= 100 IDs at a time), and
|
49
|
+
# call the scraper multiple times and reassemble output if necessary.
|
49
50
|
query = "/by_id/#{ids.join(",")}"
|
50
51
|
scrape_index query, opts
|
51
52
|
end
|
@@ -62,6 +63,7 @@ module Hermaeus
|
|
62
63
|
# Returns a String Array containing the reddit fullnames harvested from the
|
63
64
|
# input list. Input elements that do not match are stripped.
|
64
65
|
def get_fullnames data, **opts
|
66
|
+
# TODO: Move this regex to the configuration file.
|
65
67
|
regex = opts[:regex] || %r(/r/.+/(comments/)?(?<id>[0-9a-z]+)/.+)
|
66
68
|
data.map do |item|
|
67
69
|
m = item.match regex
|
@@ -89,25 +91,11 @@ module Hermaeus
|
|
89
91
|
ret = []
|
90
92
|
# reddit has finite limits on acceptable query sizes. Split the list into
|
91
93
|
# manageable portions
|
92
|
-
fullnames.
|
94
|
+
fullnames.each_slice(100).each do |chunk|
|
93
95
|
# Assemble the list of reddit objects being queried
|
94
|
-
query = chunk.join(",")
|
95
|
-
|
96
|
-
|
97
|
-
if response.success?
|
98
|
-
payload = response.body
|
99
|
-
# The payload should be a Listing even for a single-item query; the
|
100
|
-
# :children array will just have one element.
|
101
|
-
if payload[:kind] == "Listing"
|
102
|
-
payload[:data][:children].each do |item|
|
103
|
-
yield item[:data]
|
104
|
-
end
|
105
|
-
# else
|
106
|
-
end
|
107
|
-
ret << payload
|
108
|
-
end
|
109
|
-
# Keep the rate limiter happy
|
110
|
-
sleep 1
|
96
|
+
query = "/by_id/#{chunk.join(",")}.json"
|
97
|
+
response = scrape_posts query, &block
|
98
|
+
ret << response.body
|
111
99
|
end
|
112
100
|
ret
|
113
101
|
end
|
@@ -175,40 +163,40 @@ module Hermaeus
|
|
175
163
|
end
|
176
164
|
.flatten
|
177
165
|
end
|
178
|
-
end
|
179
|
-
end
|
180
166
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
167
|
+
# Internal: Provides the actual functionality for collecting posts.
|
168
|
+
#
|
169
|
+
# query - The reddit API endpoint or path being queried.
|
170
|
+
# opts - Options for the reddit API call
|
171
|
+
# block - This method yields each post fetched to its block.
|
172
|
+
# tries - hidden parameter used to prevent infinite stalling on rate limits.
|
173
|
+
#
|
174
|
+
# Returns reddit's response to the query.
|
175
|
+
def scrape_posts query, tries = 0, **opts, &block
|
176
|
+
begin
|
177
|
+
# Ask reddit to procure our items
|
178
|
+
response = @client.get(query, opts)
|
179
|
+
if response.success?
|
180
|
+
payload = response.body
|
181
|
+
# The payload should be a Listing even for a single-item query; the
|
182
|
+
# :children array will just have one element.
|
183
|
+
if payload[:kind] == "Listing"
|
184
|
+
payload[:data][:children].each do |item|
|
185
|
+
yield item[:data]
|
186
|
+
end
|
187
|
+
end
|
188
|
+
return response
|
189
|
+
end
|
190
|
+
# If at first you don't succeed...
|
191
|
+
rescue Redd::Error::RateLimited => e
|
192
|
+
sleep e.time + 1
|
193
|
+
# Try try again.
|
194
|
+
if tries < 3
|
195
|
+
scrape_posts query, tries + 1
|
196
|
+
else
|
197
|
+
raise RuntimeError, "reddit rate limit will not unlock"
|
198
|
+
end
|
210
199
|
end
|
211
|
-
ret
|
212
200
|
end
|
213
201
|
end
|
214
202
|
end
|
data/lib/hermaeus/config.rb
CHANGED
@@ -14,28 +14,34 @@ module Hermaeus
|
|
14
14
|
# List of allowed types a reddit client can take
|
15
15
|
ALLOWED_TYPES = %w[script web userless installed]
|
16
16
|
|
17
|
+
# Public: Accessor for the loaded and parsed information.
|
18
|
+
#
|
19
|
+
# Returns nil if the config file has yet to be processed.
|
20
|
+
def self.info
|
21
|
+
@info
|
22
|
+
end
|
23
|
+
|
17
24
|
# Public: Load a configuration file into memory
|
18
25
|
#
|
19
26
|
# Returns the configuration file represented as a Hash with Symbol keys
|
20
27
|
def self.load
|
21
|
-
Tomlrb.load_file FILE, symbolize_keys: true
|
28
|
+
@info = Tomlrb.load_file FILE, symbolize_keys: true
|
22
29
|
end
|
23
30
|
|
24
31
|
# Public: Performs validation checks on a configuration structure
|
25
32
|
#
|
26
|
-
# cfg - A Hash with Symbol keys to check for validity
|
27
|
-
#
|
28
33
|
# Returns true if the configuration argument is valid
|
29
34
|
#
|
30
35
|
# Raises a ConfigurationError if the configuration is invalid, with an
|
31
36
|
# error message describing the failure.
|
32
|
-
def self.validate
|
33
|
-
|
34
|
-
|
37
|
+
def self.validate!
|
38
|
+
# Validate the [client] section.
|
39
|
+
raise ConfigurationError.new(<<-EOS) unless @info.has_key? :client
|
35
40
|
Hermaeus’ configuration file must contain a [client] section.
|
36
|
-
|
37
|
-
|
38
|
-
|
41
|
+
EOS
|
42
|
+
|
43
|
+
# Validate the [client] section’s type field.
|
44
|
+
unless @info[:client].has_key?(:type) && ALLOWED_TYPES.include?(@info[:client][:type])
|
39
45
|
raise ConfigurationError.new <<-EOS
|
40
46
|
Hermaeus’ [client] section must include a type key whose value is one of:
|
41
47
|
#{ALLOWED_TYPES.join(", ")}.
|
@@ -44,7 +50,9 @@ Hermaeus’ [client] section must include a type key whose value is one of:
|
|
44
50
|
type = "one of the listed types"
|
45
51
|
EOS
|
46
52
|
end
|
47
|
-
|
53
|
+
|
54
|
+
# Validate the [client] section’s id and secret fields.
|
55
|
+
unless @info[:client].has_key?(:id) && @info[:client].has_key?(:secret)
|
48
56
|
raise ConfigurationError.new <<-EOS
|
49
57
|
Hermaeus’ [client] section must include keys for the ID and secret provided by
|
50
58
|
reddit for your application.
|
@@ -54,8 +62,10 @@ id = "an ID from reddit"
|
|
54
62
|
secret = "a secret from reddit"
|
55
63
|
EOS
|
56
64
|
end
|
57
|
-
|
58
|
-
|
65
|
+
|
66
|
+
# Validate the [client] section’s username and password fields, if needed.
|
67
|
+
if @info[:client][:type] == "script"
|
68
|
+
client = @info[:client]
|
59
69
|
unless client.has_key?(:username) && client.has_key?(:password)
|
60
70
|
raise ConfigurationError.new <<-EOS
|
61
71
|
When configured for `type = "script"`, Hermaeus’ [client] section must include
|
@@ -67,7 +77,36 @@ password = "hunter2"
|
|
67
77
|
EOS
|
68
78
|
end
|
69
79
|
end
|
70
|
-
|
80
|
+
|
81
|
+
# Validate the [archive] section.
|
82
|
+
raise ConfigurationError.new(<<-EOS) unless @info.has_key? :archive
|
83
|
+
Hermaeus’ configuration file must include an [archive] section to govern the
|
84
|
+
storage of downloaded posts.
|
85
|
+
EOS
|
86
|
+
|
87
|
+
raise ConfigurationError.new(<<-EOS) unless @info[:archive].has_key? :path
|
88
|
+
Hermaeus’ [archive] section must include a path field containing a relative or
|
89
|
+
absolute path in which to store the downloaded posts.
|
90
|
+
|
91
|
+
[archive]
|
92
|
+
path = "./archive"
|
93
|
+
# path = "/tmp/teslore/archive"
|
94
|
+
EOS
|
95
|
+
|
96
|
+
# Validate the [index] section.
|
97
|
+
raise ConfigurationError.new(<<-EOS) unless @info.has_key? :index
|
98
|
+
Hermaeus’ configuration file must include and [index] section to govern the
|
99
|
+
processing of the subreddit’s index page.
|
100
|
+
EOS
|
101
|
+
|
102
|
+
raise ConfigurationError.new(<<-EOS) unless @info[:index].has_key? :path
|
103
|
+
Hermaeus’ [index] section must include a path field containing the reddit page
|
104
|
+
at which the index resides.
|
105
|
+
|
106
|
+
[index]
|
107
|
+
path = "/r/teslore/wiki/archive"
|
108
|
+
EOS
|
109
|
+
true
|
71
110
|
end
|
72
111
|
end
|
73
112
|
end
|
data/lib/hermaeus/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hermaeus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- myrrlyn
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|