enter-rockstar 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +8 -0
- data/.travis.yml +6 -3
- data/CHANGELOG.md +7 -0
- data/Gemfile.lock +27 -23
- data/README.md +23 -3
- data/TODO.md +26 -0
- data/enter-rockstar.gemspec +5 -2
- data/lib/enter_rockstar/cli.rb +16 -10
- data/lib/enter_rockstar/corpus/tokenizer.rb +52 -1
- data/lib/enter_rockstar/generator/poetic.rb +38 -3
- data/lib/enter_rockstar/scraper/wikia.rb +22 -31
- data/lib/enter_rockstar/utils.rb +36 -0
- data/lib/enter_rockstar/version.rb +1 -1
- data/lib/enter_rockstar.rb +6 -0
- data/lyrics_data/heavy_metal_tokens.json.gz +0 -0
- data/lyrics_data/power_metal_stats.json.gz +0 -0
- data/lyrics_data/power_metal_tokens.json.gz +0 -0
- data/lyrics_data/test_stats.json.gz +0 -0
- data/lyrics_data/test_tokens.json.gz +0 -0
- data/lyrics_data/wikia_heavy_metal.json.gz +0 -0
- data/lyrics_data/wikia_power_metal.json.gz +0 -0
- metadata +55 -5
- data/lyrics_data/wikia_heavy_metal.json +0 -1
- data/lyrics_data/wikia_power_metal.json +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea03c06ad39dbfe4e9da6108fef07a4d7f45a6b81900d20d24067eb634f2c5b9
|
4
|
+
data.tar.gz: 22276980116c4dfd719162ebab301eb44fd4616739a79c26aac061da13c92aa4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d7f7a0fc4ca05c53f6c102f868fed24f74fa466bd6d96bea8f15dcd38b36db3bfb445557f3e79c440717738457f7d3fb069245e50c382bc09f4e41216a60191f
|
7
|
+
data.tar.gz: 141e7a57e51358471d5c4b662833e47c2109cf98f3c625698afe26b7695a039bbd37744e6ea745059b8bb39645f7fccaaa422abeaf648a26b393d5660e3df65d
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/CHANGELOG.md
ADDED
data/Gemfile.lock
CHANGED
@@ -3,64 +3,67 @@ PATH
|
|
3
3
|
specs:
|
4
4
|
enter-rockstar (0.1)
|
5
5
|
nokogiri
|
6
|
+
progressbar
|
6
7
|
thor
|
8
|
+
whatlanguage
|
7
9
|
|
8
10
|
GEM
|
9
11
|
remote: https://rubygems.org/
|
10
12
|
specs:
|
11
|
-
addressable (2.
|
12
|
-
public_suffix (>= 2.0.2, <
|
13
|
+
addressable (2.7.0)
|
14
|
+
public_suffix (>= 2.0.2, < 5.0)
|
13
15
|
ast (2.4.0)
|
14
16
|
coderay (1.1.2)
|
15
17
|
crack (0.4.3)
|
16
18
|
safe_yaml (~> 1.0.0)
|
17
19
|
diff-lcs (1.3)
|
18
|
-
|
19
|
-
|
20
|
+
fakefs (0.20.1)
|
21
|
+
hashdiff (1.0.0)
|
22
|
+
jaro_winkler (1.5.3)
|
20
23
|
method_source (0.9.2)
|
21
24
|
mini_portile2 (2.4.0)
|
22
|
-
nokogiri (1.10.
|
25
|
+
nokogiri (1.10.4)
|
23
26
|
mini_portile2 (~> 2.4.0)
|
24
|
-
parallel (1.
|
25
|
-
parser (2.6.
|
27
|
+
parallel (1.17.0)
|
28
|
+
parser (2.6.4.1)
|
26
29
|
ast (~> 2.4.0)
|
30
|
+
progressbar (1.10.1)
|
27
31
|
pry (0.12.2)
|
28
32
|
coderay (~> 1.1.0)
|
29
33
|
method_source (~> 0.9.0)
|
30
|
-
|
31
|
-
public_suffix (3.0.3)
|
34
|
+
public_suffix (4.0.1)
|
32
35
|
rainbow (3.0.0)
|
33
|
-
rake (12.3.
|
36
|
+
rake (12.3.3)
|
34
37
|
rspec (3.8.0)
|
35
38
|
rspec-core (~> 3.8.0)
|
36
39
|
rspec-expectations (~> 3.8.0)
|
37
40
|
rspec-mocks (~> 3.8.0)
|
38
|
-
rspec-core (3.8.
|
41
|
+
rspec-core (3.8.2)
|
39
42
|
rspec-support (~> 3.8.0)
|
40
|
-
rspec-expectations (3.8.
|
43
|
+
rspec-expectations (3.8.4)
|
41
44
|
diff-lcs (>= 1.2.0, < 2.0)
|
42
45
|
rspec-support (~> 3.8.0)
|
43
|
-
rspec-mocks (3.8.
|
46
|
+
rspec-mocks (3.8.1)
|
44
47
|
diff-lcs (>= 1.2.0, < 2.0)
|
45
48
|
rspec-support (~> 3.8.0)
|
46
|
-
rspec-support (3.8.
|
47
|
-
rubocop (0.
|
49
|
+
rspec-support (3.8.2)
|
50
|
+
rubocop (0.74.0)
|
48
51
|
jaro_winkler (~> 1.5.1)
|
49
52
|
parallel (~> 1.10)
|
50
|
-
parser (>= 2.
|
51
|
-
psych (>= 3.1.0)
|
53
|
+
parser (>= 2.6)
|
52
54
|
rainbow (>= 2.2.2, < 4.0)
|
53
55
|
ruby-progressbar (~> 1.7)
|
54
|
-
unicode-display_width (>= 1.4.0, < 1.
|
55
|
-
ruby-progressbar (1.10.
|
56
|
+
unicode-display_width (>= 1.4.0, < 1.7)
|
57
|
+
ruby-progressbar (1.10.1)
|
56
58
|
safe_yaml (1.0.5)
|
57
59
|
thor (0.20.3)
|
58
|
-
unicode-display_width (1.
|
59
|
-
vcr (
|
60
|
-
webmock (3.5
|
60
|
+
unicode-display_width (1.6.0)
|
61
|
+
vcr (5.0.0)
|
62
|
+
webmock (3.7.5)
|
61
63
|
addressable (>= 2.3.6)
|
62
64
|
crack (>= 0.3.2)
|
63
|
-
hashdiff
|
65
|
+
hashdiff (>= 0.4.0, < 2.0.0)
|
66
|
+
whatlanguage (1.0.6)
|
64
67
|
|
65
68
|
PLATFORMS
|
66
69
|
ruby
|
@@ -68,6 +71,7 @@ PLATFORMS
|
|
68
71
|
DEPENDENCIES
|
69
72
|
bundler (~> 2.0)
|
70
73
|
enter-rockstar!
|
74
|
+
fakefs
|
71
75
|
pry
|
72
76
|
rake
|
73
77
|
rspec
|
data/README.md
CHANGED
@@ -1,10 +1,24 @@
|
|
1
|
-
[![Build Status](https://travis-ci.
|
1
|
+
[![Build Status](https://travis-ci.com/marcinruszkiewicz/enter-rockstar.svg?branch=master)](https://travis-ci.com/marcinruszkiewicz/enter-rockstar)
|
2
2
|
[![Gem Version](https://badge.fury.io/rb/enter-rockstar.svg)](https://badge.fury.io/rb/enter-rockstar)
|
3
3
|
|
4
4
|
# Enter Rockstar - a tool to help with programming in Rockstar
|
5
5
|
|
6
6
|
This is a set of tools that help Rockstar programmers create programs in the [Rockstar language](https://github.com/RockstarLang/rockstar).
|
7
7
|
|
8
|
+
Basically this allows you (with some setup) to do something like this:
|
9
|
+
|
10
|
+
```
|
11
|
+
$ enter-rockstar poetic 561 lyrics_data/sentenced_tokens.json
|
12
|
+
agony faster desperation
|
13
|
+
swept dismay mothafuckin
|
14
|
+
knees desire melancholic
|
15
|
+
fears facial destruction
|
16
|
+
doing sweats generations
|
17
|
+
|
18
|
+
```
|
19
|
+
|
20
|
+
Which helps greatly with writing cool looking Rockstar programs.
|
21
|
+
|
8
22
|
For details on what is done and what I'm still working on, see the TODO.md and CHANGELOG.md files.
|
9
23
|
|
10
24
|
## Installation
|
@@ -43,6 +57,12 @@ After this command finishes (which might take a long time depending on what cate
|
|
43
57
|
|
44
58
|
Now that you have a set of lyrics, it's time to convert them into something that Enter Rockstar can use.
|
45
59
|
|
60
|
+
```
|
61
|
+
$ enter-rockstar tokenize power_metal lyrics/
|
62
|
+
```
|
63
|
+
|
64
|
+
Depending on how many lyrics you feed into this command, this can take a long time. Also note that lyrics that aren't in English will be skipped, as Rockstar doesn't really support other human languages right now.
|
65
|
+
|
46
66
|
## Generating lyrics
|
47
67
|
|
48
68
|
### Finding words for poetic literals
|
@@ -50,10 +70,10 @@ Now that you have a set of lyrics, it's time to convert them into something that
|
|
50
70
|
The most common and basic function of Enter Rockstar is just finding interesting words of good length to use in the poetic numeral representations. Finding out what words to use to represent `123` is not as easy as it might sound and this makes it easier:
|
51
71
|
|
52
72
|
```
|
53
|
-
$ enter-rockstar poetic 123
|
73
|
+
$ enter-rockstar poetic 123 lyrics_data/power_metal_tokens.json --amount 10
|
54
74
|
```
|
55
75
|
|
56
|
-
|
76
|
+
The second argument should be a JSON token list generated in the previous step. You can also skip it, in which case Enter Rockstar will use its built-in list generated from the Heavy Metal category.
|
57
77
|
|
58
78
|
## Contributing
|
59
79
|
|
data/TODO.md
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# 0.2
|
2
|
+
|
3
|
+
Features:
|
4
|
+
|
5
|
+
- [x] CLI for scraping lyrics and parsing them to JSON datasets
|
6
|
+
- [x] Initial dataset from Heavy Metal lyrics included in the gem
|
7
|
+
- [x] Random strategy and CLI for generating poetic representations of numbers
|
8
|
+
- [x] Make sure that the generator works with both Floats and Integers
|
9
|
+
- [x] Gzip/Gunzip data JSONs for smaller file sizes
|
10
|
+
- [ ] Generator should fall back on bundled token set
|
11
|
+
|
12
|
+
Other:
|
13
|
+
|
14
|
+
- [x] Better progress indicator for parsing files
|
15
|
+
- [ ] Test the wikia scraper fully
|
16
|
+
- [x] Test generating poetic representations
|
17
|
+
- [ ] Test CLI
|
18
|
+
- [x] Fix tests on Ruby 2.3 and 2.4
|
19
|
+
- [ ] Make sure that the basic tokens gzip is distributed with the gem so it's actually useful without spending
|
20
|
+
a day on scraping wikia
|
21
|
+
|
22
|
+
# Future improvements
|
23
|
+
|
24
|
+
- [ ] Add better generation strategies so the generated numbers potentially make more sense
|
25
|
+
- [ ] Add a complex generator that generates variable names together with numbers
|
26
|
+
- [ ] Add a translator so you can write minimal Rockstar and translate it automatically to proper Rockstar code
|
data/enter-rockstar.gemspec
CHANGED
@@ -24,13 +24,16 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.required_ruby_version = '>= 2.3'
|
25
25
|
|
26
26
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
27
|
+
spec.add_development_dependency 'fakefs'
|
27
28
|
spec.add_development_dependency 'pry'
|
28
29
|
spec.add_development_dependency 'rake'
|
29
30
|
spec.add_development_dependency 'rspec'
|
30
31
|
spec.add_development_dependency 'rubocop'
|
31
32
|
spec.add_development_dependency 'vcr'
|
32
33
|
spec.add_development_dependency 'webmock'
|
33
|
-
|
34
|
-
spec.add_dependency 'thor'
|
34
|
+
|
35
35
|
spec.add_dependency 'nokogiri'
|
36
|
+
spec.add_dependency 'progressbar'
|
37
|
+
spec.add_dependency 'thor'
|
38
|
+
spec.add_dependency 'whatlanguage'
|
36
39
|
end
|
data/lib/enter_rockstar/cli.rb
CHANGED
@@ -11,13 +11,13 @@ module EnterRockstar
|
|
11
11
|
desc 'scrape_category CATEGORY_NAME URL', 'scrape lyrics wikia category page for bands and albums'
|
12
12
|
def scrape_category(category_name, url)
|
13
13
|
scraper = EnterRockstar::Scraper::Wikia.new(category_name: category_name, url: url)
|
14
|
-
scraper.parse_category
|
14
|
+
scraper.parse_category
|
15
15
|
scraper.save_category
|
16
16
|
say
|
17
17
|
end
|
18
18
|
|
19
19
|
desc 'scrape_lyrics CATEGORY_NAME START_INDEX', 'scrape actual lyrics from the lyrics wikia using the generated json file'
|
20
|
-
def scrape_lyrics(category_name, start_index=0)
|
20
|
+
def scrape_lyrics(category_name, start_index = 0)
|
21
21
|
scraper = EnterRockstar::Scraper::Wikia.new(category_name: category_name)
|
22
22
|
scraper.load_saved_json
|
23
23
|
scraper.parse_all_pages(start_index: start_index)
|
@@ -30,18 +30,24 @@ module EnterRockstar
|
|
30
30
|
scraper.print_indexed_tree
|
31
31
|
end
|
32
32
|
|
33
|
-
desc 'tokenize DATA_DIR', 'take the downloaded lyrics text files and tokenize them'
|
34
|
-
def tokenize(data_dir)
|
35
|
-
tokenizer = EnterRockstar::Corpus::Tokenizer.new(data_dir: data_dir)
|
33
|
+
desc 'tokenize NAME DATA_DIR', 'take the downloaded lyrics text files and tokenize them'
|
34
|
+
def tokenize(name, data_dir)
|
35
|
+
tokenizer = EnterRockstar::Corpus::Tokenizer.new(data_dir: data_dir, name: name)
|
36
36
|
tokenizer.tokenize
|
37
|
+
tokenizer.save_all
|
37
38
|
end
|
38
39
|
|
39
|
-
desc 'poetic NUMBER', 'generate a poetic representation of a number from the word base'
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
desc 'poetic NUMBER SOURCE_JSON', 'generate a poetic representation of a number from the word base'
|
41
|
+
option :amount, desc: 'how many number representations should be generated'
|
42
|
+
option :strategy, desc: 'generating strategy. One of: [random weighted]'
|
43
|
+
# Print poetic-literal candidates for NUMBER, one per line.
#
# number      - the number to encode, as typed on the command line.
# source_json - path to a tokens JSON (optionally gzipped) produced by the
#               `tokenize` command. May be omitted, in which case the
#               heavy-metal token list shipped in the gem's lyrics_data/
#               directory is used, as the README describes.
#
# The --amount and --strategy options are forwarded unchanged; the generator
# is responsible for validating them.
def poetic(number, source_json = nil)
  # This file lives in lib/enter_rockstar/, so the gem root is two levels up.
  source_json ||= File.expand_path('../../lyrics_data/heavy_metal_tokens.json.gz', __dir__)

  generator = EnterRockstar::Generator::Poetic.new(
    data_file: source_json,
    amount: options[:amount] || 5,
    strategy: options[:strategy] || 'random'
  )

  say generator.number(number).join("\n")
end
|
46
52
|
end
|
47
53
|
end
|
@@ -1,15 +1,66 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'whatlanguage'
|
4
|
+
require 'ruby-progressbar'
|
5
|
+
|
3
6
|
module EnterRockstar
|
4
7
|
module Corpus
|
5
8
|
# Turns a directory tree of downloaded lyrics (*.txt) into two data sets:
#
# * tokens - every distinct word of at least MIN_TOKEN_LENGTH letters,
#            grouped by word length (used to pick words for poetic literals)
# * stats  - for every run of NGRAM_SIZE - 1 consecutive words, how often
#            each following word was seen
#
# Only texts that the language detector classifies as English are used.
class Tokenizer
  # Length of the word windows recorded in the stats (context + continuation).
  NGRAM_SIZE = 3
  # Words shorter than this are not stored in the token lists.
  MIN_TOKEN_LENGTH = 4

  # data_dir   - directory that is searched recursively for *.txt lyrics.
  # name       - prefix of the generated files.
  # output_dir - directory the gzipped JSON files are written to; it must
  #              already exist when save_all is called.
  def initialize(data_dir:, name:, output_dir: 'lyrics_data')
    @data_dir = data_dir
    @stats = {}
    @tokens = {}
    # Membership index for @tokens so de-duplication is a hash lookup
    # instead of a scan over an ever-growing array.
    @seen = {}
    @output_stats = "#{output_dir}/#{name}_stats.json.gz"
    @output_tokens = "#{output_dir}/#{name}_tokens.json.gz"
    @wl = WhatLanguage.new(:all)
  end

  # Reads every lyrics file under the data directory and accumulates its
  # tokens and n-gram statistics. Progress is reported on standard output.
  def tokenize
    text_files = Dir.glob("#{@data_dir}/**/*.txt")
    puts "Parsing #{text_files.count} files."
    progressbar = ProgressBar.create(title: 'Progress', total: text_files.count)

    text_files.each do |filename|
      # Read as UTF-8 regardless of locale and replace invalid bytes, so one
      # badly encoded file cannot abort a long run.
      text = File.read(filename, encoding: 'UTF-8').scrub

      # Rockstar doesn't really work well with languages other than English
      _collect(_to_tokens(text)) if @wl.language(text) == :english
      progressbar.increment
    end
    puts
  end

  # Writes the collected tokens and stats as gzipped JSON.
  def save_all
    EnterRockstar::Utils.save_file(@output_tokens, @tokens.to_json)
    EnterRockstar::Utils.save_file(@output_stats, @stats.to_json)
  end

  private

  # Folds the tokens of one text into @stats and @tokens.
  def _collect(tokenized)
    # which word appears after which preceding words
    tokenized.each_cons(NGRAM_SIZE) do |*head, continuation|
      (@stats[head] ||= Hash.new(0))[continuation] += 1
    end

    # the words themselves, grouped by length, each stored once
    tokenized.each do |token|
      next if token.length < MIN_TOKEN_LENGTH || @seen.key?(token)

      @seen[token] = true
      (@tokens[token.length] ||= []) << token
    end
  end

  # Lower-cases the text and returns its runs of letters as symbols, in order.
  def _to_tokens(text)
    text.downcase.scan(/[[:alpha:]]+/).map(&:to_sym)
  end
end
|
15
66
|
end
|
@@ -2,14 +2,49 @@
|
|
2
2
|
|
3
3
|
module EnterRockstar
|
4
4
|
module Generator
|
5
|
-
#
|
5
|
+
# Poetic number generator: replaces every digit of a number with a word
# whose length encodes that digit, drawing the words from a token list
# keyed by word length (as a string, e.g. "5").
class Poetic
  # Maps the public strategy names to the private methods implementing them.
  STRATEGIES = {
    'random' => '_random'
  }.freeze
  DEFAULT_STRATEGY = '_random'
  DEFAULT_AMOUNT = 5

  attr_reader :tokens, :amount, :strategy

  # data_file - path of the tokens JSON (plain or gzipped) to load.
  # amount    - how many alternatives #number returns; anything that cannot
  #             be converted to an Integer falls back to DEFAULT_AMOUNT.
  # strategy  - one of the STRATEGIES keys; unknown names fall back to random.
  def initialize(data_file:, amount: 5, strategy: 'random')
    @tokens = JSON.parse(EnterRockstar::Utils.load_json(data_file))
    @amount = _parse_amount(amount)
    @strategy = STRATEGIES.fetch(strategy, DEFAULT_STRATEGY)
  end

  # Returns an Array of `amount` Strings, each a space separated word
  # sequence encoding num. A decimal point is kept as a literal "." item.
  #
  # Raises ArgumentError when num contains anything besides digits and ".",
  # such as a sign or an exponent, because those cannot be encoded as words.
  # Raises KeyError when the token list has no word of a required length.
  def number(num)
    text = num.to_s
    if (unexpected = text[/[^0-9.]/])
      raise ArgumentError, "cannot write #{text.inspect} as a poetic literal: unexpected #{unexpected.inspect}"
    end

    # split the number into its individual characters
    digits = text.chars
    @amount.times.map { send(@strategy, digits).join(' ') }
  end

  private

  # Converts the requested amount to an Integer, using the default for
  # nil, non-numeric strings and non-finite floats.
  def _parse_amount(amount)
    Integer(amount)
  rescue ArgumentError, TypeError, RangeError
    DEFAULT_AMOUNT
  end

  # Picks an independent random word for every digit.
  def _random(array)
    array.map { |digit| digit == '.' ? '.' : _sample_word(digit) }
  end

  # Returns a random word encoding the given digit character.
  def _sample_word(digit)
    value = digit.to_i
    # digits less than 4 should use longer words (10 to 13 letters)
    length = (value < 4 ? value + 10 : value).to_s
    words = @tokens[length]
    raise KeyError, "no words of length #{length} in the token set" if words.nil? || words.empty?

    words.sample
  end
end
|
15
50
|
end
|
@@ -2,7 +2,6 @@
|
|
2
2
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
-
require 'json'
|
6
5
|
|
7
6
|
module EnterRockstar
|
8
7
|
module Scraper
|
@@ -12,18 +11,18 @@ module EnterRockstar
|
|
12
11
|
DATA_DIR = 'lyrics'
|
13
12
|
SLEEP_BETWEEN_REQUESTS = 0.1
|
14
13
|
|
15
|
-
attr_reader :tree
|
14
|
+
attr_reader :tree, :url, :category_name, :output
|
16
15
|
|
17
16
|
def initialize(category_name: 'heavy_metal', url: '/wiki/Category:Genre/Heavy_Metal', data_dir: 'lyrics_data')
|
18
17
|
@tree = {}
|
19
|
-
@output = "#{data_dir}/wikia_#{category_name}.json"
|
18
|
+
@output = "#{data_dir}/wikia_#{category_name}.json.gz"
|
20
19
|
@url = url
|
21
20
|
@category_name = category_name
|
22
21
|
end
|
23
22
|
|
24
23
|
def parse_category(url: nil, test_limit: false)
|
25
24
|
url ||= START_HOST + @url
|
26
|
-
html = URI.
|
25
|
+
html = URI.parse(url).open
|
27
26
|
doc = Nokogiri::HTML(html)
|
28
27
|
|
29
28
|
# get all category member links and sort them by band and album
|
@@ -49,17 +48,11 @@ module EnterRockstar
|
|
49
48
|
end
|
50
49
|
|
51
50
|
def save_category
|
52
|
-
|
53
|
-
out = File.new(@output, 'w')
|
54
|
-
out.write @tree.to_json
|
55
|
-
out.close
|
56
|
-
puts "Saved JSON data to #{@output}"
|
51
|
+
EnterRockstar::Utils.save_file(@output, @tree.to_json)
|
57
52
|
end
|
58
53
|
|
59
54
|
def load_saved_json
|
60
|
-
|
61
|
-
@tree = JSON.parse(file)
|
62
|
-
@new_tree = JSON.parse(file)
|
55
|
+
@tree = JSON.parse(EnterRockstar::Utils.load_json(@output))
|
63
56
|
end
|
64
57
|
|
65
58
|
def print_indexed_tree
|
@@ -78,24 +71,23 @@ module EnterRockstar
|
|
78
71
|
dirname = k == 'band_url' ? [DATA_DIR, @category_name, key].join('/') : [DATA_DIR, @category_name, key, k].join('/')
|
79
72
|
FileUtils.mkdir_p dirname
|
80
73
|
|
81
|
-
parse_page(v, dirname
|
74
|
+
parse_page(v, dirname)
|
82
75
|
end
|
83
76
|
end
|
84
|
-
|
85
|
-
@tree = @new_tree
|
86
|
-
save_category
|
87
77
|
end
|
88
78
|
|
89
|
-
def parse_page(url, dirname
|
90
|
-
puts url
|
79
|
+
def parse_page(url, dirname)
|
91
80
|
sleep SLEEP_BETWEEN_REQUESTS
|
92
|
-
html = URI.
|
81
|
+
html = URI.parse(START_HOST + url).open
|
93
82
|
doc = Nokogiri::HTML(html)
|
94
83
|
|
95
84
|
if doc.css('h2 span.mw-headline a').count.zero?
|
96
85
|
# single album page listed on the category
|
97
86
|
doc.css('div.mw-content-text ol li a').each do |song|
|
98
|
-
|
87
|
+
next unless song&.attr('href')
|
88
|
+
|
89
|
+
lyrics = parse_song(song.attr('href'), dirname, song.text)
|
90
|
+
save_song("#{dirname}/#{song.text}.txt", lyrics) unless lyrics.nil?
|
99
91
|
end
|
100
92
|
puts
|
101
93
|
else
|
@@ -104,11 +96,13 @@ module EnterRockstar
|
|
104
96
|
# some band pages have extra albums that are not listed in the category page for some reason
|
105
97
|
album_dirname = [dirname, album.text].join('/')
|
106
98
|
FileUtils.mkdir_p album_dirname
|
107
|
-
@new_tree[band][album.text] = album.attr('href')
|
108
99
|
|
109
100
|
# get song pages
|
110
101
|
album.parent.parent.css('+ div + ol > li a').each do |song|
|
111
|
-
|
102
|
+
next unless song&.attr('href')
|
103
|
+
|
104
|
+
lyrics = parse_song(song.attr('href'), album_dirname, song.text)
|
105
|
+
save_song("#{album_dirname}/#{song.text}.txt", lyrics) unless lyrics.nil?
|
112
106
|
end
|
113
107
|
puts
|
114
108
|
end
|
@@ -126,21 +120,18 @@ module EnterRockstar
|
|
126
120
|
|
127
121
|
print '.'
|
128
122
|
sleep SLEEP_BETWEEN_REQUESTS
|
129
|
-
html = URI.
|
123
|
+
html = URI.parse(START_HOST + url).open
|
130
124
|
doc = Nokogiri::HTML(html)
|
131
125
|
|
132
126
|
lyrics = doc.css('div.lyricbox').first
|
133
127
|
return if lyrics.nil?
|
128
|
+
return if lyrics.css('a')&.first&.attr('href') == '/wiki/Category:Instrumental'
|
134
129
|
|
135
|
-
|
136
|
-
|
137
|
-
else
|
138
|
-
proper_text = lyrics.inner_html.gsub(%r{<div.*?(\/div>)}, '').split('<br>').join("\n")
|
130
|
+
lyrics.inner_html.split('<br>').join("\n").gsub(%r{<\/?[^>]*>}, '')
|
131
|
+
end
|
139
132
|
|
140
|
-
|
141
|
-
|
142
|
-
out.close
|
143
|
-
end
|
133
|
+
def save_song(songfile, contents)
|
134
|
+
EnterRockstar::Utils.save_plain(songfile, contents)
|
144
135
|
end
|
145
136
|
end
|
146
137
|
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'zlib'
|
4
|
+
|
5
|
+
module EnterRockstar
  # shared utility code for different modules
  class Utils
    # Matches the gzip extension only at the very end of a path.
    GZIP_SUFFIX = /\.gz\z/.freeze

    # Returns the raw (still unparsed) JSON text stored in +file+.
    #
    # A path ending in ".gz" that exists is decompressed. Otherwise the same
    # path without the ".gz" suffix is read as plain text, so callers can ask
    # for "foo.json.gz" and transparently get an uncompressed "foo.json".
    #
    # Raises IOError when neither file exists.
    def self.load_json(file)
      # GzipReader.open reads in binary mode and closes the file afterwards.
      return Zlib::GzipReader.open(file, &:read) if file.end_with?('.gz') && File.exist?(file)

      plain = file.sub(GZIP_SUFFIX, '')
      raise IOError, "File not found: #{file}" unless File.exist?(plain)

      File.read(plain)
    end

    # Writes +contents+ to +filename+ gzip-compressed (whatever the file
    # extension is) and reports the file name on standard output.
    def self.save_file(filename, contents)
      # GzipWriter.open writes in binary mode and finishes the stream even
      # when the write raises.
      Zlib::GzipWriter.open(filename) { |gz| gz.write(contents) }
      puts "Saved as #{filename}"
    end

    # Writes +contents+ to +filename+ uncompressed and reports the file name
    # on standard output.
    def self.save_plain(filename, contents)
      File.write(filename, contents)
      puts "Saved as #{filename}"
    end
  end
end
|
data/lib/enter_rockstar.rb
CHANGED
@@ -1,6 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'json'
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
require 'enter_rockstar/utils'
|
3
7
|
require 'enter_rockstar/scraper/wikia'
|
8
|
+
require 'enter_rockstar/corpus/tokenizer'
|
9
|
+
require 'enter_rockstar/generator/poetic'
|
4
10
|
require 'pry'
|
5
11
|
|
6
12
|
module EnterRockstar
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|