hermaeus 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 21e1bdd247e29c25253a077f728526d215f91a14
4
- data.tar.gz: 8a01184c5502a2a9f03d1c82ff0d20b7664bc27b
3
+ metadata.gz: ca26f9776acb76bf0097fa0c43b18d1563890084
4
+ data.tar.gz: fb9835383d6e290d90cf8cfa9191033031e469de
5
5
  SHA512:
6
- metadata.gz: 86d808111b779e4a78ec52727e589e7dda9f9f102424fbd4ad1ac9aa3183c5a0989c63023133580970ec5a421e9c6ae3d8865948e396c0d4270041c789089e3f
7
- data.tar.gz: 92660f2b99ea6cc07057606620be793ab62947edd64d4a4e2477e5cda3607fb2578b7475de8c0d194cdfe5f465a2bb33b6d12a0b92db8993cf428a9b167472c2
6
+ metadata.gz: 6111c2eee01f5c4122aa1542510aa8610a2a4542a053c470a7aa333be282dd7616c7f40f3121700a2b8ddb997adbf1cd368a0998641b15bd900603247159d4ed
7
+ data.tar.gz: f81f1158506b2eed59c30a29258cfafefdcf41b3b1f7fdca584a814c4fdf7d87b0d9b81159c80c92f8c9afdc83405f846fdd4cbd0ee97ec82e411be8572453f7
@@ -2,18 +2,29 @@
2
2
 
3
3
  ## v1
4
4
 
5
- ### v1.0.2
5
+ ### v1.1
6
+
7
+ Move the index page's path out of a hard-coded magic string and into the configuration file.
8
+
9
+ Update the configuration handler to account for the new information, and provide
10
+ some more helpful error messages on validation failure.
11
+
12
+ Update the Changelog structure slightly.
13
+
14
+ ### v1.0
15
+
16
+ #### v1.0.2
6
17
 
7
18
  Fix a File.open call; it was accidentally being given a string permission set
8
19
  instead of a numeric one.
9
20
 
10
- ### v1.0.1
21
+ #### v1.0.1
11
22
 
12
23
  Deployment to a separate machine brought some hidden bugs to light. Fixed a
13
24
  syntax error in the initialization routines and added documentation about fresh
14
25
  installations to the README.
15
26
 
16
- ### v1.0.0
27
+ #### v1.0.0
17
28
 
18
29
  Added a storage backend (`Apocryphon` and `Archivist` classes) capable of
19
30
  formatting the retrieved text and storing it on disk.
@@ -21,16 +32,16 @@ formatting the retrieved text and storing it on disk.
21
32
  `mora` is confirmed to work in the wild, and so Hermaeus is ready for a 1.0
22
33
  release.
23
34
 
24
- ## v0
35
+ ### v0
25
36
 
26
37
  Development versions used only for experimentation.
27
38
 
28
- ### v0.2.0
39
+ #### v0.2.0
29
40
 
30
41
  Completed the ability to retrieve texts from reddit and process them enough for
31
42
  demonstration purposes.
32
43
 
33
- ### v0.1.0
44
+ #### v0.1.0
34
45
 
35
46
  Initial version -- Gained the ability to connect to reddit and retrieve basic
36
47
  information.
@@ -16,3 +16,7 @@ password = "hunter2"
16
16
  [archive]
17
17
  # The path where Hermaeus dumps output files.
18
18
  path = "archive"
19
+
20
+ # Settings for the /r/teslore index page
21
+ [index]
22
+ path = "/r/teslore/wiki/archive"
@@ -18,8 +18,17 @@ module Hermaeus
18
18
  def self.init
19
19
  FileUtils.mkdir_p(Config::DIR)
20
20
  if File.exist? Config::FILE
21
- @cfg = Config.load
22
- Config.validate @cfg
21
+ Config.load
22
+ begin
23
+ Config.validate!
24
+ rescue ConfigurationError => e
25
+ puts <<-EOS
26
+ #{e.message}
27
+
28
+ Edit your configuration file (#{File.join Config::DIR, "config.toml"}) to \
29
+ continue.
30
+ EOS
31
+ end
23
32
  else
24
33
  File.open Config::FILE, "w+" do |file|
25
34
  File.open File.expand_path(Config::SOURCE), "r", 0600 do |cfg|
@@ -35,7 +44,7 @@ for Hermaeus to function.
35
44
 
36
45
  # Public: Connects Hermaeus to reddit.
37
46
  def self.connect
38
- @client = Client.new @cfg[:client]
47
+ @client = Client.new
39
48
  end
40
49
 
41
50
  # Public: Downloads Apocrypha posts.
@@ -4,6 +4,8 @@
4
4
  redd
5
5
  ].each(&method(:require))
6
6
 
7
+ include Enumerable
8
+
7
9
  require "hermaeus/config"
8
10
  require "hermaeus/version"
9
11
 
@@ -13,13 +15,10 @@ module Hermaeus
13
15
  class Client
14
16
  USER_AGENT = "Redd/Ruby:Hermaeus:#{Hermaeus::VERSION} (by /u/myrrlyn)"
15
17
  # Public: Connects the Hermaeus::Client to reddit.
16
- #
17
- # info - A Hash with Symbol keys containing reddit connection information.
18
- # It should be the `[:client]` section of the Hash returned by
19
- # `Hermaeus::Config.load`.
20
- def initialize client
21
- Config.validate client: client
22
- @client = Redd.it(client.delete(:type).to_sym, *client.values, user_agent: USER_AGENT)
18
+ def initialize
19
+ Config.validate!
20
+ cfg = Config.info[:client]
21
+ @client = Redd.it(cfg.delete(:type).to_sym, *cfg.values, user_agent: USER_AGENT)
23
22
  @client.authorize!
24
23
  @html_filter = HTMLEntities.new
25
24
  end
@@ -28,7 +27,7 @@ module Hermaeus
28
27
  #
29
28
  # Wraps Client#scrape_index; see it for documentation.
30
29
  def get_global_listing **opts
31
- scrape_index "/r/teslore/wiki/compilation", opts
30
+ scrape_index Config.info[:index][:path], opts
32
31
  end
33
32
 
34
33
  # Public: Scrapes a Weekly Community Thread patch index.
@@ -46,6 +45,8 @@ module Hermaeus
46
45
  ids.map! do |id|
47
46
  "t3_#{id}" unless id.match /^t3_/
48
47
  end
48
+ # TODO: Ensure that this is safe (only query <= 100 IDs at a time), and
49
+ # call the scraper multiple times and reassemble output if necessary.
49
50
  query = "/by_id/#{ids.join(",")}"
50
51
  scrape_index query, opts
51
52
  end
@@ -62,6 +63,7 @@ module Hermaeus
62
63
  # Returns a String Array containing the reddit fullnames harvested from the
63
64
  # input list. Input elements that do not match are stripped.
64
65
  def get_fullnames data, **opts
66
+ # TODO: Move this regex to the configuration file.
65
67
  regex = opts[:regex] || %r(/r/.+/(comments/)?(?<id>[0-9a-z]+)/.+)
66
68
  data.map do |item|
67
69
  m = item.match regex
@@ -89,25 +91,11 @@ module Hermaeus
89
91
  ret = []
90
92
  # reddit has finite limits on acceptable query sizes. Split the list into
91
93
  # manageable portions
92
- fullnames.fracture.each do |chunk|
94
+ fullnames.each_slice(100).each do |chunk|
93
95
  # Assemble the list of reddit objects being queried
94
- query = chunk.join(",")
95
- # Ask reddit to procure our items
96
- response = @client.get("/by_id/#{query}.json")
97
- if response.success?
98
- payload = response.body
99
- # The payload should be a Listing even for a single-item query; the
100
- # :children array will just have one element.
101
- if payload[:kind] == "Listing"
102
- payload[:data][:children].each do |item|
103
- yield item[:data]
104
- end
105
- # else
106
- end
107
- ret << payload
108
- end
109
- # Keep the rate limiter happy
110
- sleep 1
96
+ query = "/by_id/#{chunk.join(",")}.json"
97
+ response = scrape_posts query, &block
98
+ ret << response.body
111
99
  end
112
100
  ret
113
101
  end
@@ -175,40 +163,40 @@ module Hermaeus
175
163
  end
176
164
  .flatten
177
165
  end
178
- end
179
- end
180
166
 
181
- class Array
182
- # Public: Splits an Array into several arrays, each of which has a maximum
183
- # size.
184
- #
185
- # size - The maximum length of each segment. Defaults to 100.
186
- #
187
- # Returns an Array of Arrays. Each element of the returned array is a section
188
- # of the original array.
189
- #
190
- # Examples
191
- #
192
- # %w[a b c d e f g h i j k l m n o p q r s t u v w x y z].fracture 5
193
- # => [
194
- # ["a", "b", "c", "d", "e"],
195
- # ["f", "g", "h", "i", "j"],
196
- # ["k", "l", "m", "n", "o"],
197
- # ["p", "q", "r", "s", "t"],
198
- # ["u", "v", "w", "x", "y"],
199
- # ["z"]
200
- # ]
201
- # %w[hello world].fracture 5 => [["hello", "world"]]
202
- def fracture size = 100
203
- if self.length < size
204
- [self]
205
- else
206
- ret = []
207
- self.each_with_index do |val, idx|
208
- ret[idx / size] ||= []
209
- ret[idx / size] << val
167
+ # Internal: Provides the actual functionality for collecting posts.
168
+ #
169
+ # query - The reddit API endpoint or path being queried.
170
+ # opts - Options for the reddit API call
171
+ # block - This method yields each post fetched to its block.
172
+ # tries - Internal retry counter, not meant to be passed by callers; it caps the number of retries after a rate-limit error.
173
+ #
174
+ # Returns reddit's response to the query.
175
+ def scrape_posts query, tries = 0, **opts, &block
176
+ begin
177
+ # Ask reddit to procure our items
178
+ response = @client.get(query, opts)
179
+ if response.success?
180
+ payload = response.body
181
+ # The payload should be a Listing even for a single-item query; the
182
+ # :children array will just have one element.
183
+ if payload[:kind] == "Listing"
184
+ payload[:data][:children].each do |item|
185
+ yield item[:data]
186
+ end
187
+ end
188
+ return response
189
+ end
190
+ # If at first you don't succeed...
191
+ rescue Redd::Error::RateLimited => e
192
+ sleep e.time + 1
193
+ # Try try again.
194
+ if tries < 3
195
+ scrape_posts query, tries + 1
196
+ else
197
+ raise RuntimeError, "reddit rate limit will not unlock"
198
+ end
210
199
  end
211
- ret
212
200
  end
213
201
  end
214
202
  end
@@ -14,28 +14,34 @@ module Hermaeus
14
14
  # List of allowed types a reddit client can take
15
15
  ALLOWED_TYPES = %w[script web userless installed]
16
16
 
17
+ # Public: Accessor for the loaded and parsed information.
18
+ #
19
+ # Returns nil if the config file has yet to be processed.
20
+ def self.info
21
+ @info
22
+ end
23
+
17
24
  # Public: Load a configuration file into memory
18
25
  #
19
26
  # Returns the configuration file represented as a Hash with Symbol keys
20
27
  def self.load
21
- Tomlrb.load_file FILE, symbolize_keys: true
28
+ @info = Tomlrb.load_file FILE, symbolize_keys: true
22
29
  end
23
30
 
24
31
  # Public: Performs validation checks on a configuration structure
25
32
  #
26
- # cfg - A Hash with Symbol keys to check for validity
27
- #
28
33
  # Returns true if the loaded configuration is valid
29
34
  #
30
35
  # Raises a ConfigurationError if the configuration is invalid, with an
31
36
  # error message describing the failure.
32
- def self.validate cfg
33
- unless cfg.has_key? :client
34
- raise ConfigurationError.new <<-EOS
37
+ def self.validate!
38
+ # Validate the [client] section.
39
+ raise ConfigurationError.new(<<-EOS) unless @info.has_key? :client
35
40
  Hermaeus’ configuration file must contain a [client] section.
36
- EOS
37
- end
38
- unless cfg[:client].has_key?(:type) && ALLOWED_TYPES.include?(cfg[:client][:type])
41
+ EOS
42
+
43
+ # Validate the [client] section’s type field.
44
+ unless @info[:client].has_key?(:type) && ALLOWED_TYPES.include?(@info[:client][:type])
39
45
  raise ConfigurationError.new <<-EOS
40
46
  Hermaeus’ [client] section must include a type key whose value is one of:
41
47
  #{ALLOWED_TYPES.join(", ")}.
@@ -44,7 +50,9 @@ Hermaeus’ [client] section must include a type key whose value is one of:
44
50
  type = "one of the listed types"
45
51
  EOS
46
52
  end
47
- unless cfg[:client].has_key?(:id) && cfg[:client].has_key?(:secret)
53
+
54
+ # Validate the [client] section’s id and secret fields.
55
+ unless @info[:client].has_key?(:id) && @info[:client].has_key?(:secret)
48
56
  raise ConfigurationError.new <<-EOS
49
57
  Hermaeus’ [client] section must include keys for the ID and secret provided by
50
58
  reddit for your application.
@@ -54,8 +62,10 @@ id = "an ID from reddit"
54
62
  secret = "a secret from reddit"
55
63
  EOS
56
64
  end
57
- if cfg[:client][:type] == "script"
58
- client = cfg[:client]
65
+
66
+ # Validate the [client] section’s username and password fields, if needed.
67
+ if @info[:client][:type] == "script"
68
+ client = @info[:client]
59
69
  unless client.has_key?(:username) && client.has_key?(:password)
60
70
  raise ConfigurationError.new <<-EOS
61
71
  When configured for `type = "script"`, Hermaeus’ [client] section must include
@@ -67,7 +77,36 @@ password = "hunter2"
67
77
  EOS
68
78
  end
69
79
  end
70
- true
80
+
81
+ # Validate the [archive] section.
82
+ raise ConfigurationError.new(<<-EOS) unless @info.has_key? :archive
83
+ Hermaeus’ configuration file must include an [archive] section to govern the
84
+ storage of downloaded posts.
85
+ EOS
86
+
87
+ raise ConfigurationError.new(<<-EOS) unless @info[:archive].has_key? :path
88
+ Hermaeus’ [archive] section must include a path field containing a relative or
89
+ absolute path in which to store the downloaded posts.
90
+
91
+ [archive]
92
+ path = "./archive"
93
+ # path = "/tmp/teslore/archive"
94
+ EOS
95
+
96
+ # Validate the [index] section.
97
+ raise ConfigurationError.new(<<-EOS) unless @info.has_key? :index
98
+ Hermaeus’ configuration file must include and [index] section to govern the
99
+ processing of the subreddit’s index page.
100
+ EOS
101
+
102
+ raise ConfigurationError.new(<<-EOS) unless @info[:index].has_key? :path
103
+ Hermaeus’ [index] section must include a path field containing the reddit page
104
+ at which the index resides.
105
+
106
+ [index]
107
+ path = "/r/teslore/wiki/archive"
108
+ EOS
109
+ true
71
110
  end
72
111
  end
73
112
  end
@@ -1,3 +1,3 @@
1
1
  module Hermaeus
2
- VERSION = "1.0.2"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hermaeus
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - myrrlyn
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-10-12 00:00:00.000000000 Z
11
+ date: 2016-10-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler