hermaeus 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.editorconfig +10 -0
- data/.gitignore +9 -0
- data/CHANGELOG.md +25 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +51 -0
- data/Rakefile +2 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/data/config.toml +18 -0
- data/data/usage.txt +13 -0
- data/exe/mora +29 -0
- data/hermaeus.gemspec +35 -0
- data/lib/hermaeus.rb +64 -0
- data/lib/hermaeus/apocryphon.rb +33 -0
- data/lib/hermaeus/archivist.rb +58 -0
- data/lib/hermaeus/client.rb +214 -0
- data/lib/hermaeus/config.rb +73 -0
- data/lib/hermaeus/error.rb +19 -0
- data/lib/hermaeus/version.rb +3 -0
- metadata +165 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 36fd56c600a2194bd70996d4c1b1f49c04b6b7c4
|
4
|
+
data.tar.gz: 11e649ff3475b06302a5cbc02218504fe56d2fad
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 732743530fc850a0f0a1a7fd4b4478ec2c487fbe1524efad56ee832ab81eabf19fc6cfd58bfad84f9e38756c1eaca7c7e1fef482082b9290dde666bc96681f5a
|
7
|
+
data.tar.gz: c61797a379bd777382b1a7268b604657052e3b4623e609a7a2a7bdf43b04e3fd69bba1856dad387be4321de6bc69dd508262c095b4d0e271c8069fc5045d5198
|
data/.editorconfig
ADDED
data/.gitignore
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Hermaeus Changelog
|
2
|
+
|
3
|
+
## v1
|
4
|
+
|
5
|
+
### v1.0.0
|
6
|
+
|
7
|
+
Added a storage backend (`Apocryphon` and `Archivist` classes) capable of
|
8
|
+
formatting the retrieved text and storing it on disk.
|
9
|
+
|
10
|
+
`mora` is confirmed to work in the wild, and so Hermaeus is ready for a 1.0
|
11
|
+
release.
|
12
|
+
|
13
|
+
## v0
|
14
|
+
|
15
|
+
Development versions used only for experimentation.
|
16
|
+
|
17
|
+
### v0.2.0
|
18
|
+
|
19
|
+
Completed the ability to retrieve texts from reddit and process them enough for
|
20
|
+
demonstration purposes.
|
21
|
+
|
22
|
+
### v0.1.0
|
23
|
+
|
24
|
+
Initial version -- Gained the ability to connect to reddit and retrieve basic
|
25
|
+
information.
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 myrrlyn
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# Hermaeus
|
2
|
+
|
3
|
+
Hermaeus Mora, the Daedric Prince of Fate and Knowledge, hoards information in
|
4
|
+
his halls of Apocrypha.
|
5
|
+
|
6
|
+
/r/teslore maintains a list of Apocryphal texts, but since they are reddit posts
|
7
|
+
by ordinary users, they are at risk of deletion. `Hermaeus` provides a means of
|
8
|
+
collecting and archiving /r/teslore Apocrypha.
|
9
|
+
|
10
|
+
`Hermaeus` works by scraping established index lists on /r/teslore, including
|
11
|
+
the Compendium wiki pages and the weekly Community Threads in which new entries
|
12
|
+
are announced, and collects the Markdown source of the referenced posts.
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
Add this line to your application's Gemfile:
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
gem 'hermaeus'
|
20
|
+
```
|
21
|
+
|
22
|
+
And then execute:
|
23
|
+
|
24
|
+
$ bundle
|
25
|
+
|
26
|
+
Or install it yourself as:
|
27
|
+
|
28
|
+
$ gem install hermaeus
|
29
|
+
|
30
|
+
## Usage
|
31
|
+
|
32
|
+
`Hermaeus` can be used in other Ruby scripts via top-level methods, or via the
|
33
|
+
`mora` executable.
|
34
|
+
|
35
|
+
## Development
|
36
|
+
|
37
|
+
After checking out the repo, run `bin/setup` to install dependencies. You can
|
38
|
+
also run `bin/console` for an interactive prompt that will allow you to
|
39
|
+
experiment.
|
40
|
+
|
41
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
42
|
+
|
43
|
+
## Contributing
|
44
|
+
|
45
|
+
Bug reports and pull requests are welcome on GitHub at
|
46
|
+
https://github.com/myrrlyn/hermaeus.
|
47
|
+
|
48
|
+
## License
|
49
|
+
|
50
|
+
The gem is available as open source under the terms of the
|
51
|
+
[MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "hermaeus"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
require "pry"
|
11
|
+
Pry.start
|
12
|
+
|
13
|
+
# require "irb"
|
14
|
+
# IRB.start
|
data/bin/setup
ADDED
data/data/config.toml
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Information for Hermaeus' reddit client
|
2
|
+
[client]
|
3
|
+
# Setting this to "script" lets Hermaeus access reddit as a regular user. Which
|
4
|
+
# user is set below.
|
5
|
+
type = "script"
|
6
|
+
# Creating a reddit API client yields two strings. The shorter is the ID, the
|
7
|
+
# longer is a secret. Hermaeus needs both to connect.
|
8
|
+
id = "a string from reddit"
|
9
|
+
secret = "a longer string from reddit"
|
10
|
+
# If acting as a user, Hermaeus also needs to identify which user that is. The
|
11
|
+
# reddit username and password go here.
|
12
|
+
username = "Librarian_Robot"
|
13
|
+
password = "hunter2"
|
14
|
+
|
15
|
+
# Settings for the files Hermaeus collects
|
16
|
+
[archive]
|
17
|
+
# The path where Hermaeus dumps output files.
|
18
|
+
path = "archive"
|
data/data/usage.txt
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
`mora` uses subcommands for its operation.
|
2
|
+
|
3
|
+
- 'help' (also '-h' and '--help' to fit expectations) displays this message.
|
4
|
+
|
5
|
+
- 'version' (also '-v' and '--version') prints the version string.
|
6
|
+
|
7
|
+
- 'seek' accesses reddit to download Apocrypha posts. It takes an argument to
|
8
|
+
determine whether to target the global wiki list or a Community Thread:
|
9
|
+
|
10
|
+
- 'seek index' accesses the /r/teslore/wiki/compilation page
|
11
|
+
|
12
|
+
- 'seek com $ID' accesses a Community Thread. This requires one or more post
|
13
|
+
IDs to scan.
|
data/exe/mora
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "hermaeus"
|
4
|
+
|
5
|
+
# Dispatch on the first command-line argument (the subcommand). Anything that
# is not recognized, including no argument at all, prints the usage text.
command = ARGV.shift

case command
when "seek"
  type = ARGV.shift
  if %w[index com].include? type
    # Check the arguments before Hermaeus.init/Hermaeus.connect run, so a bad
    # invocation fails without touching the config file or contacting reddit.
    if type == "com" && ARGV.empty?
      raise ArgumentError, "com MUST have thread IDs specified"
    end
    Hermaeus.init
    Hermaeus.connect
    arc = Hermaeus::Archivist.new
    # Every retrieved post is announced on stdout and then written to disk.
    Hermaeus.seek(type, ARGV) do |post|
      apoc = Hermaeus::Apocryphon.new post
      puts apoc
      arc.save_to_file apoc
    end
  else
    # Missing or unknown seek target
    Hermaeus.help
  end
when /^-{0,2}version$/, "-v"
  puts Hermaeus::VERSION
else
  # Covers 'help', '-h', '--help', no arguments, and unknown subcommands
  Hermaeus.help
end
|
data/hermaeus.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'hermaeus/version'
|
5
|
+
|
6
|
+
# Gem packaging description for Hermaeus.
Gem::Specification.new do |gem|
  # Identity
  gem.name = "hermaeus"
  gem.version = Hermaeus::VERSION
  gem.authors = ["myrrlyn"]
  gem.email = ["myrrlyn@outlook.com"]

  # Human-readable descriptions
  gem.summary = "Archivist for /r/teslore"
  gem.description = <<-EOS
Hermaeus provides archival services for /r/teslore by collecting lists of posts
and then downloading the source material those lists reference.
EOS
  gem.homepage = "https://github.com/myrrlyn/hermaeus"
  gem.license = "MIT"

  # Package every file tracked by git except test-related directories.
  tracked = `git ls-files -z`.split("\x0")
  gem.files = tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  gem.bindir = "exe"
  # Everything under exe/ is installed as an executable, by base name.
  gem.executables = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # Development-only tooling
  gem.add_development_dependency "bundler", "~> 1.13"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "pry"

  # Runtime dependencies
  gem.add_dependency "htmlentities", "~> 4.3"
  gem.add_dependency "nokogiri", "~> 1.6"
  gem.add_dependency "redd", "~> 0.7"
  gem.add_dependency "tomlrb"
end
|
data/lib/hermaeus.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require "hermaeus/apocryphon"
|
2
|
+
require "hermaeus/archivist"
|
3
|
+
require "hermaeus/client"
|
4
|
+
require "hermaeus/config"
|
5
|
+
require "hermaeus/error"
|
6
|
+
require "hermaeus/version"
|
7
|
+
|
8
|
+
require "fileutils"
|
9
|
+
|
10
|
+
# Public: Root module for Hermaeus.
|
11
|
+
#
|
12
|
+
# Hermaeus’ top-level methods provide the interface used by `mora`.
|
13
|
+
module Hermaeus
  # Public: Initializes Hermaeus for use.
  #
  # Ensures the configuration directory exists, then loads and validates the
  # configuration file.
  #
  # Returns the result of Config.validate when a configuration file exists.
  #
  # Raises a ConfigurationError if Hermaeus’ config file does not exist, after
  # creating a sample configuration file for modification.
  def self.init
    FileUtils.mkdir_p(Config::DIR)
    if File.exist? Config::FILE
      @cfg = Config.load
      Config.validate @cfg
    else
      # Seed the user's config from the bundled template. The file is meant to
      # hold reddit credentials, so it is created with owner-only permissions.
      File.open(Config::FILE, "w", 0600) do |file|
        file << File.read(File.expand_path(Config::SOURCE))
      end
      raise ConfigurationError.new <<-EOS
You must put your reddit credentials in #{File.join Config::DIR,"config.toml"} \
for Hermaeus to function.
EOS
    end
  end

  # Public: Connects Hermaeus to reddit.
  #
  # Uses the [client] section of the configuration loaded by Hermaeus.init, so
  # init must have been called successfully first.
  def self.connect
    @client = Client.new @cfg[:client]
  end

  # Public: Downloads Apocrypha posts.
  #
  # type - "index" or "com"
  # ids - A list of thread IDs to access and scrape, if type is "com"
  #
  # Yields each retrieved post to the given block (see Client#get_posts).
  #
  # Returns whatever Client#get_posts returns.
  #
  # Raises ArgumentError if type is neither "index" nor "com".
  def self.seek type, ids, &block
    list = case type
    when "index" then @client.get_global_listing
    when "com" then @client.get_weekly_listing ids
    else
      raise ArgumentError,
        "unknown seek type #{type.inspect}; expected \"index\" or \"com\""
    end
    @client.get_posts @client.get_fullnames(list), &block
  end

  # Public: Print usage information for `mora`.
  #
  # `mora` may not know where Hermaeus is installed, so Hermaeus has to load the
  # help file for it.
  def self.help
    puts File.read(File.join(File.dirname(__FILE__), "..", "data", "usage.txt"))
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Hermaeus
  # Public: Data structure describing a Compendium entry.
  class Apocryphon
    # Public: Constructs an Apocryphon from reddit data responses.
    #
    # data - A Hash emitted by Client#get_posts
    def initialize data
      @data = data
    end

    # Public: Serializes the Apocryphon item to a string.
    #
    # Returns a String containing the title and author.
    def to_s
      "#{title} – by #{author}"
    end

    # Public: Permit method-style access to the underlying data Hash's keys.
    #
    # Returns the value stored under the Symbol form of the method name, or
    # whatever the Hash returns for a key it does not hold.
    def method_missing name, *args, &block
      @data[name.to_sym]
    end

    # Public: Report the data Hash's keys as methods, matching method_missing.
    def respond_to_missing? name, include_private = false
      (@data.respond_to?(:key?) && @data.key?(name.to_sym)) || super
    end

    # Public: Accessor for the Apocryphon's Markdown text.
    def text
      @data[:selftext]
    end

    # Public: Accessor for the Apocryphon's HTML as compiled by reddit.
    def html
      # Read the instance variable directly: a bare `data` would be routed
      # through method_missing and come back as nil.
      @data[:selftext_html]
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require "hermaeus/config"

require "time"
|
2
|
+
|
3
|
+
module Hermaeus
  # Public: Formats retrieved Apocrypha and stores them on disk.
  class Archivist
    # Public: Prepares the Archivist for writing.
    #
    # Reads the [archive] section of the configuration file and creates the
    # directory named by its path key if it does not exist.
    def initialize
      @html_filter = HTMLEntities.new
      @config = Config.load[:archive]
      FileUtils.mkdir_p @config[:path]
    end

    # Public: Builds the front-matter header written at the top of each file.
    #
    # apoc - An object responding to author, title, created and id, such as an
    #        Apocryphon.
    #
    # Returns a String delimited by "---" lines and followed by a blank line.
    def add_metadata apoc
      <<-EOS
---
author: #{apoc.author}
title: #{apoc.title}
date: #{Time.at(apoc.created.to_i).iso8601}
reddit: #{apoc.id}
---

EOS
    end

    # Public: Writes an Apocryphon to a file in the archive directory.
    #
    # The file name is the lower-cased title with spaces and slashes replaced
    # by underscores and colons and quotes removed, plus ".html.md". Posts whose
    # text is "[deleted]" or "[removed]" are skipped.
    #
    # apoc - The Apocryphon to store.
    def save_to_file apoc
      return if apoc.text == "[deleted]" || apoc.text == "[removed]"
      title = apoc.title.downcase.gsub(/[ \/]/, "_").gsub(/[:"']/, "") + ".html.md"
      File.open(File.join(@config[:path], title), "w+") do |file|
        file << add_metadata(apoc)
        file << prettify(apoc.text)
      end
    end

    # Public: Decodes HTML entities in a text and wraps its lines.
    #
    # text - The String to format.
    # length: The maximum line width, not counting the newline. Defaults to 80.
    #
    # Returns the formatted String; every line ends with a newline.
    def prettify text, length: 80
      @html_filter.decode(text)
        .split("\n")
        # split removed the newlines, so put one back before wrapping
        .map { |line| break_line(line << "\n", length: length) }
        .join
    end

    private

    # Internal: Recursively wraps one newline-terminated line.
    #
    # The line is cut at the last space within the first `length` characters.
    # When that stretch has no space, no newline is inserted there and wrapping
    # continues with the remainder, so long unbroken words stay intact.
    def break_line line, length: 80
      if line.length > length + 1
        left, right = line[0...length], line[length...line.length]
        cut = left.rindex " "
        if cut
          left, right = line[0...cut] << "\n", line[(cut + 1)...line.length]
        end
        right = break_line right, length: length
        line = left.concat right
      end
      line
    end
  end
end
|
@@ -0,0 +1,214 @@
|
|
1
|
+
%w[
|
2
|
+
htmlentities
|
3
|
+
nokogiri
|
4
|
+
redd
|
5
|
+
].each(&method(:require))
|
6
|
+
|
7
|
+
require "hermaeus/config"
|
8
|
+
require "hermaeus/version"
|
9
|
+
|
10
|
+
module Hermaeus
|
11
|
+
# Public: Wraps a reddit client for access to reddit's API, and provides
|
12
|
+
# methods for downloading posts from reddit.
|
13
|
+
class Client
|
14
|
+
USER_AGENT = "Redd/Ruby:Hermaeus:#{Hermaeus::VERSION} (by /u/myrrlyn)"
|
15
|
+
# Public: Connects the Hermaeus::Client to reddit.
#
# client - A Hash with Symbol keys containing reddit connection information.
#          It should be the `[:client]` section of the Hash returned by
#          `Hermaeus::Config.load`. Apart from :type, its values are handed to
#          Redd.it positionally in the Hash's own order.
#
# Raises a ConfigurationError if the Hash fails Config.validate.
def initialize client
  Config.validate client: client
  # Work on a copy so the caller's configuration Hash keeps its :type key.
  credentials = client.dup
  kind = credentials.delete(:type).to_sym
  @client = Redd.it(kind, *credentials.values, user_agent: USER_AGENT)
  @client.authorize!
  @html_filter = HTMLEntities.new
end
|
26
|
+
|
27
|
+
# Public: Scrapes the Compilation full index.
#
# Wraps Client#scrape_index; see it for documentation.
def get_global_listing **opts
  # Forward the options as keywords; a positional Hash is rejected by a
  # **opts parameter on Ruby 3 and later.
  scrape_index "/r/teslore/wiki/compilation", **opts
end
|
33
|
+
|
34
|
+
# Public: Scrapes a Weekly Community Thread patch index.
#
# ids - A String Array of reddit post IDs for Weekly Community Threads. A
#       single String is also accepted. IDs may be bare ("56j7pq") or already
#       in fullname form ("t3_56j7pq"). The argument is not modified.
#
# Examples:
#
#   get_weekly_listing "56j7pq" # Targets one Community Thread
#   get_weekly_listing ["56j7pq", "55erkr"] # Targets two Community Threads
#   get_weekly_listing ["55erkr"], css: "td:last-child a" # Custom CSS selector
#
# Wraps Client#scrape_index; see it for documentation.
def get_weekly_listing ids, **opts
  # Prefix only the IDs that are not fullnames yet, leaving the rest as-is.
  fullnames = Array(ids).map do |id|
    id.start_with?("t3_") ? id : "t3_#{id}"
  end
  scrape_index "/by_id/#{fullnames.join(",")}", **opts
end
|
52
|
+
|
53
|
+
# Public: Transforms a list of raw reddit links ("/r/SUB/comments/ID/NAME")
# into their reddit fullname ("t3_ID").
#
# data - A String Array such as that returned by get_global_listing.
#
# Optional parameters:
#
# regex: A Regular Expression used to match the reddit ID out of a link. It
#        must define a named capture group called `id`.
#
# Returns a String Array containing the reddit fullnames harvested from the
# input list. Input elements that do not match are stripped.
def get_fullnames data, **opts
  # The subreddit name is matched as a single path segment so that the ID is
  # always the segment after it (or after "comments/"), even when the link has
  # further segments such as a comment permalink.
  regex = opts[:regex] || %r(/r/[^/]+/(comments/)?(?<id>[0-9a-z]+)/.+)
  data.map do |item|
    m = item.match regex
    "t3_#{m[:id]}" if m
  end
  .compact
end
|
72
|
+
|
73
|
+
# Public: Collects posts from reddit.
#
# fullnames - A String Array of reddit fullnames ("tNUM_ID", following
#             reddit documentation) to query.
#
# Yields a sequence of Hashes, each describing a reddit post. The block is
# optional; without one the posts are only collected in the return value.
#
# Returns an Array of the response bodies from the reddit call(s). Requests
# that do not report success contribute nothing. An empty input makes no
# request and returns an empty Array.
#
# Examples
#
#   get_posts get_fullnames get_global_listing do |post|
#     puts post[:selftext] # Prints the Markdown source of each post
#   end
#   => returns an array of hashes, each of which includes an array of posts.
def get_posts fullnames, &block
  ret = []
  # reddit has finite limits on acceptable query sizes. Split the list into
  # manageable portions of at most 100 fullnames.
  fullnames.each_slice(100).each_with_index do |chunk, idx|
    # Keep the rate limiter happy: pause between consecutive requests
    sleep 1 if idx > 0
    # Ask reddit to procure the items named in this chunk
    response = @client.get("/by_id/#{chunk.join(",")}.json")
    next unless response.success?
    payload = response.body
    # The payload should be a Listing even for a single-item query; the
    # :children array will just have one element.
    if block && payload[:kind] == "Listing"
      payload[:data][:children].each { |item| block.call item[:data] }
    end
    ret << payload
  end
  ret
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
# Internal: Governs the actual functionality of the index scrapers.
#
# path - The reddit API or path being queried. It can be a post ID/fullname
#        or a full URI.
#
# Optional parameters:
#
# css: The CSS selector string used to get the links referenced on the page.
#
# Returns an array of all the referenced links. These links will need to be
# broken down into reddit fullnames before Hermaeus can download them.
#
# Raises a RuntimeError if reddit answers with anything other than a wikipage
# or a Listing.
def scrape_index path, **opts
  # This is a magic string that targets the index format /r/teslore uses to
  # enumerate their Compendium, in the wiki page and weekly patch posts.
  query = opts[:css] || "td:first-child a"
  # Reddit will respond with an HTML dump, if we are querying a wiki page,
  # or a wrapped HTML dump, if we are querying a post.
  fetch = @client.get(path).body
  # Normalize the response into an array of hashes which have the desired text
  # as a direct child.
  pages = case fetch[:kind]
  when "wikipage" then [fetch[:data]]
  when "Listing" then fetch[:data][:children].map { |child| child[:data] }
  else
    raise "Unexpected reddit response kind #{fetch[:kind].inspect} for #{path}"
  end
  # reddit will put the text data in :content_html if we queried a wikipage,
  # or :selftext_html if we queried a post. Entries carrying neither (nil HTML)
  # are dropped. Each remaining entry is unescaped, parsed, queried with the
  # CSS selector, and reduced to the href of every matching link that has one.
  pages.map { |page| page[:content_html] || page[:selftext_html] }
    .compact
    .flat_map do |html|
      Nokogiri::HTML(@html_filter.decode(html)).css(query).map { |link| link["href"] }
    end
    .compact
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
class Array
  # Public: Partitions the receiver into consecutive segments of bounded
  # length.
  #
  # size - The maximum length of each segment. Defaults to 100.
  #
  # Returns an Array of Arrays that together hold the receiver's elements in
  # their original order. A receiver shorter than size is returned as the sole
  # segment, so an empty Array yields [[]].
  #
  # Examples
  #
  #   %w[a b c d e f g].fracture 3
  #   => [["a", "b", "c"], ["d", "e", "f"], ["g"]]
  #   %w[hello world].fracture 5 => [["hello", "world"]]
  def fracture size = 100
    return [self] if length < size

    each_with_index.each_with_object([]) do |(element, position), segments|
      # Integer division maps each position onto its segment's slot
      slot = position / size
      segments[slot] ||= []
      segments[slot] << element
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require "hermaeus/error"
|
2
|
+
|
3
|
+
require "tomlrb"
|
4
|
+
|
5
|
+
module Hermaeus
  # Public: Configuration paths, loading and validation for Hermaeus
  module Config
    # Per-user directory holding Hermaeus' configuration
    DIR = File.join Dir.home, ".hermaeus"
    # The configuration file inside DIR
    FILE = File.join DIR, "config.toml"
    # The template configuration shipped in Hermaeus’ source tree
    SOURCE = File.join(File.dirname(__FILE__), "..", "..", "data", "config.toml")
    # Values accepted for the [client] section's type key
    ALLOWED_TYPES = %w[script web userless installed]

    # Public: Reads the configuration file from disk
    #
    # Returns the parsed contents of FILE as a Hash with Symbol keys
    def self.load
      Tomlrb.load_file FILE, symbolize_keys: true
    end

    # Public: Checks a configuration structure for the required entries
    #
    # cfg - A Hash with Symbol keys to check for validity
    #
    # Returns true if the configuration argument is valid
    #
    # Raises a ConfigurationError naming the first missing or invalid entry:
    # the [client] section, its type, its id and secret, or (for the "script"
    # type) its username and password.
    def self.validate cfg
      if !cfg.has_key?(:client)
        raise ConfigurationError.new <<-EOS
Hermaeus’ configuration file must contain a [client] section.
EOS
      end

      client = cfg[:client]

      if !client.has_key?(:type) || !ALLOWED_TYPES.include?(client[:type])
        raise ConfigurationError.new <<-EOS
Hermaeus’ [client] section must include a type key whose value is one of:
#{ALLOWED_TYPES.join(", ")}.

[client]
type = "one of the listed types"
EOS
      end

      if !client.has_key?(:id) || !client.has_key?(:secret)
        raise ConfigurationError.new <<-EOS
Hermaeus’ [client] section must include keys for the ID and secret provided by
reddit for your application.

[client]
id = "an ID from reddit"
secret = "a secret from reddit"
EOS
      end

      # Only script clients need the account credentials
      acts_as_user = client[:type] == "script"
      if acts_as_user && !(client.has_key?(:username) && client.has_key?(:password))
        raise ConfigurationError.new <<-EOS
When configured for `type = "script"`, Hermaeus’ [client] section must include
keys for the reddit account username and password as which it will work.

[client]
username = "a_reddit_username"
password = "hunter2"
EOS
      end

      true
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Hermaeus
  # Public: Describes an error with the configuration file.
  #
  # Inherits from StandardError so that a plain `rescue` clause handles it.
  class ConfigurationError < StandardError
    # Public: Describes a configuration error with a given message.
    #
    # message - an optional String describing what went wrong. Default value is
    #           "Hermaeus is incorrectly configured."
    def initialize message = nil
      # Passing the text to super lets the inherited #message and #to_s
      # serialize the error.
      super(message || "Hermaeus is incorrectly configured.")
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,165 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hermaeus
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- myrrlyn
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-10-12 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.13'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.13'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: pry
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: htmlentities
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '4.3'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '4.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.6'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.6'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: redd
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.7'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.7'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: tomlrb
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description: |
|
112
|
+
Hermaeus provides archival services for /r/teslore by collecting lists of posts
|
113
|
+
and then downloading the source material those lists reference.
|
114
|
+
email:
|
115
|
+
- myrrlyn@outlook.com
|
116
|
+
executables:
|
117
|
+
- mora
|
118
|
+
extensions: []
|
119
|
+
extra_rdoc_files: []
|
120
|
+
files:
|
121
|
+
- ".editorconfig"
|
122
|
+
- ".gitignore"
|
123
|
+
- CHANGELOG.md
|
124
|
+
- Gemfile
|
125
|
+
- LICENSE.txt
|
126
|
+
- README.md
|
127
|
+
- Rakefile
|
128
|
+
- bin/console
|
129
|
+
- bin/setup
|
130
|
+
- data/config.toml
|
131
|
+
- data/usage.txt
|
132
|
+
- exe/mora
|
133
|
+
- hermaeus.gemspec
|
134
|
+
- lib/hermaeus.rb
|
135
|
+
- lib/hermaeus/apocryphon.rb
|
136
|
+
- lib/hermaeus/archivist.rb
|
137
|
+
- lib/hermaeus/client.rb
|
138
|
+
- lib/hermaeus/config.rb
|
139
|
+
- lib/hermaeus/error.rb
|
140
|
+
- lib/hermaeus/version.rb
|
141
|
+
homepage: https://github.com/myrrlyn/hermaeus
|
142
|
+
licenses:
|
143
|
+
- MIT
|
144
|
+
metadata: {}
|
145
|
+
post_install_message:
|
146
|
+
rdoc_options: []
|
147
|
+
require_paths:
|
148
|
+
- lib
|
149
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
150
|
+
requirements:
|
151
|
+
- - ">="
|
152
|
+
- !ruby/object:Gem::Version
|
153
|
+
version: '0'
|
154
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - ">="
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '0'
|
159
|
+
requirements: []
|
160
|
+
rubyforge_project:
|
161
|
+
rubygems_version: 2.5.1
|
162
|
+
signing_key:
|
163
|
+
specification_version: 4
|
164
|
+
summary: Archivist for /r/teslore
|
165
|
+
test_files: []
|