markovian 0.2.0 → 0.2.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +7 -1
- data/changelog.md +7 -0
- data/lib/markovian/corpus/chain.rb +3 -2
- data/lib/markovian/corpus/dictionary.rb +14 -8
- data/lib/markovian/corpus/dictionary_entry.rb +54 -0
- data/lib/markovian/text_builder.rb +2 -1
- data/lib/markovian/utils/text_splitter.rb +4 -14
- data/lib/markovian/version.rb +1 -1
- data/markovian.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3c30d2a0dcf0c8488bab6aa456ecd5a761fa04ae
|
4
|
+
data.tar.gz: c4900a0c75636c29e33697d9ab3bcff85bb13828
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 01e05e4c15feb7a2938603a6ef7fda354871d1474e493701475d5f1600498633c525982b9ca4b210b22303b6d18effe0c1d29412ee5fb201218cf4a0bd8f2a83
|
7
|
+
data.tar.gz: 99ff58a5fa7b60a0bfdd69cf450ba650e4958e6047ed18dd07ea8c6e211974c696d87fb671a46103257935b7d36d60fe7d6b25f9ac21bf7c8f387e5e8c4d6066
|
data/README.md
CHANGED
@@ -35,6 +35,13 @@ After checking out the repo, run `bin/setup` to install dependencies. You can al
|
|
35
35
|
|
36
36
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
37
37
|
|
38
|
+
## Related Projects
|
39
|
+
|
40
|
+
* [tokeneyes](https://github.com/arsduo/tokeneyes): a tokenizing system used by Markovian that
|
41
|
+
provides info on punctuation and sentence flow (to be incorporated shortly)
|
42
|
+
* [markovian-lambda](https://github.com/arsduo/markovian-lambda): an inchoate set of utility
|
43
|
+
classes for using Markovian, eventually with AWS Lambda.
|
44
|
+
|
38
45
|
## Contributing
|
39
46
|
|
40
47
|
Bug reports and pull requests are welcome on GitHub at https://github.com/arsduo/markovian. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
@@ -42,4 +49,3 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/arsduo
|
|
42
49
|
## License
|
43
50
|
|
44
51
|
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
45
|
-
|
data/changelog.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## 0.2.9
|
4
|
+
|
5
|
+
Internal refactors only, no new functionality.
|
6
|
+
|
7
|
+
* Refactor Dictionary to use DictionaryEntry objects, which can store additional metadata
|
8
|
+
* Use Tokeneyes to parse strings rather than the original String#split-based TextSplitter
|
9
|
+
|
3
10
|
## 0.2.0
|
4
11
|
|
5
12
|
* Rename Chainset/Chain to Corpus (better name is better)
|
@@ -11,7 +11,6 @@ module Markovian
|
|
11
11
|
@two_key_dictionary = Dictionary.new
|
12
12
|
end
|
13
13
|
|
14
|
-
attr_reader :one_key_dictionary, :two_key_dictionary
|
15
14
|
def lengthen(word, next_word:, previous_word: nil)
|
16
15
|
@one_key_dictionary.push(word, next_word)
|
17
16
|
@two_key_dictionary.push(two_word_key(previous_word, word), next_word)
|
@@ -33,6 +32,9 @@ module Markovian
|
|
33
32
|
|
34
33
|
protected
|
35
34
|
|
35
|
+
# for equality checking
|
36
|
+
attr_reader :one_key_dictionary, :two_key_dictionary
|
37
|
+
|
36
38
|
def result_for_two_words(previous_word, word)
|
37
39
|
@two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
|
38
40
|
end
|
@@ -49,4 +51,3 @@ module Markovian
|
|
49
51
|
end
|
50
52
|
end
|
51
53
|
end
|
52
|
-
|
@@ -1,17 +1,21 @@
|
|
1
|
-
|
2
|
-
# them. Currently it's implemented as a hash of arrays, but more advanced representations may
|
3
|
-
# follow.
|
1
|
+
require 'markovian/corpus/dictionary_entry'
|
4
2
|
#
|
5
|
-
#
|
3
|
+
# This class represents a dictionary of words or phrases and the various words that can follow
|
4
|
+
# them. The key is an opaque value, which could represent either a single word or a phrase as desired.
|
6
5
|
module Markovian
|
7
6
|
class Corpus
|
8
7
|
class Dictionary
|
9
|
-
def push(key, word)
|
10
|
-
|
8
|
+
def push(key, word, direction: :forwards)
|
9
|
+
# Incoming we get a Tokeneyes::Word object
|
10
|
+
dictionary[key.to_s].push(word, direction: direction)
|
11
11
|
end
|
12
12
|
|
13
13
|
def next_word(key)
|
14
|
-
dictionary[key].
|
14
|
+
dictionary[key].next_word
|
15
|
+
end
|
16
|
+
|
17
|
+
def previous_word(key)
|
18
|
+
dictionary[key].previous_word
|
15
19
|
end
|
16
20
|
|
17
21
|
def random_word
|
@@ -32,7 +36,9 @@ module Markovian
|
|
32
36
|
protected
|
33
37
|
|
34
38
|
def dictionary
|
35
|
-
|
39
|
+
# We have to set the value of the hash in the block, otherwise it doesn't actually seem to
|
40
|
+
# get saved properly: a Hash default block's return value is only stored if the block assigns it.
|
41
|
+
@dictionary ||= Hash.new {|hash, key| hash[key] = DictionaryEntry.new(key)}
|
36
42
|
end
|
37
43
|
end
|
38
44
|
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Markovian
|
2
|
+
class Corpus
|
3
|
+
class DictionaryEntry
|
4
|
+
attr_reader :word, :count
|
5
|
+
def initialize(word)
|
6
|
+
@word = word
|
7
|
+
@next_words = []
|
8
|
+
@previous_words = []
|
9
|
+
@count = 0
|
10
|
+
end
|
11
|
+
|
12
|
+
def push(word, direction: :forwards)
|
13
|
+
# The incoming word will be a Tokeneyes::Word object
|
14
|
+
array_for_direction(direction) << word.to_s
|
15
|
+
# we don't want to double-count words if we read the text both forward and backward, so
|
16
|
+
# only count in the forward direction. (If we encounter a scenario where someone only wants
|
17
|
+
# to read in the backward direction, we can deal with that then.)
|
18
|
+
@count += 1 if direction == :forwards
|
19
|
+
end
|
20
|
+
|
21
|
+
def next_word
|
22
|
+
next_words.sample
|
23
|
+
end
|
24
|
+
|
25
|
+
def previous_word
|
26
|
+
previous_words.sample
|
27
|
+
end
|
28
|
+
|
29
|
+
def ==(other)
|
30
|
+
self.word == other.word &&
|
31
|
+
self.next_words == other.next_words &&
|
32
|
+
self.previous_words == other.previous_words
|
33
|
+
end
|
34
|
+
|
35
|
+
protected
|
36
|
+
|
37
|
+
# for equality checking
|
38
|
+
attr_reader :next_words, :previous_words
|
39
|
+
|
40
|
+
VALID_DIRECTIONS = [:backwards, :forwards]
|
41
|
+
|
42
|
+
def array_for_direction(direction)
|
43
|
+
validate_direction(direction)
|
44
|
+
direction == :backwards ? previous_words : next_words
|
45
|
+
end
|
46
|
+
|
47
|
+
def validate_direction(direction)
|
48
|
+
unless VALID_DIRECTIONS.include?(direction)
|
49
|
+
raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -58,7 +58,8 @@ module Markovian
|
|
58
58
|
end
|
59
59
|
|
60
60
|
def split_seed_text(seed_text)
|
61
|
-
|
61
|
+
# We get back Tokeneyes::Word objects, but for now only care about the strings within
|
62
|
+
Utils::TextSplitter.new(seed_text).components.map(&:to_s)
|
62
63
|
end
|
63
64
|
end
|
64
65
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'tokeneyes'
|
2
|
+
|
1
3
|
module Markovian
|
2
4
|
module Utils
|
3
5
|
class TextSplitter
|
@@ -6,29 +8,17 @@ module Markovian
|
|
6
8
|
@text = text
|
7
9
|
end
|
8
10
|
|
9
|
-
# We split on spaces, quotes, (various symbols followed by either another dash, a space,
|
10
|
-
# another dot, or the end of the text), or (colons preceded by space or the beginning of the
|
11
|
-
# text).
|
12
|
-
# We don't want to block things like Jones-Smith, tl;dr, abc.def, or it's.
|
13
|
-
# Any of the following:
|
14
|
-
# [\s\(\)] - a space or parentheses on their own
|
15
|
-
# " - a quote on its own
|
16
|
-
# [\.-:;\?\!]([-\.\s]|$) - a period, dash, ?, or ! followed by a space, period, dash, or the
|
17
|
-
# end of the word
|
18
|
-
# [\s^]' - a single ' following a non-letter
|
19
|
-
WORD_DELIMITERS = /([\s\(\)]|"|[\.\-:;\?\!]([\-\.\s]|$)|[\s^]')/
|
20
|
-
|
21
11
|
# anything that doesn't contain any letters is not a word we need to care about
|
22
12
|
MARKERS_OF_INTEREST = /[A-Za-z]/
|
23
13
|
|
24
14
|
def components
|
25
|
-
split_text.select {|
|
15
|
+
split_text.select {|w| w.text.match(MARKERS_OF_INTEREST)}
|
26
16
|
end
|
27
17
|
|
28
18
|
protected
|
29
19
|
|
30
20
|
def split_text
|
31
|
-
text.downcase.
|
21
|
+
Tokeneyes::Tokenizer.new(text.downcase).parse_into_words
|
32
22
|
end
|
33
23
|
end
|
34
24
|
end
|
data/lib/markovian/version.rb
CHANGED
data/markovian.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
20
|
spec.require_paths = ["lib"]
|
21
21
|
|
22
|
+
spec.add_runtime_dependency "tokeneyes", "~> 0.1.0"
|
22
23
|
spec.add_development_dependency "bundler", "~> 1.7"
|
23
24
|
spec.add_development_dependency "rake", "~> 10.0"
|
24
25
|
end
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markovian
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Koppel
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: tokeneyes
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.1.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.1.0
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: bundler
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -65,6 +79,7 @@ files:
|
|
65
79
|
- lib/markovian/corpus/chain.rb
|
66
80
|
- lib/markovian/corpus/compiler.rb
|
67
81
|
- lib/markovian/corpus/dictionary.rb
|
82
|
+
- lib/markovian/corpus/dictionary_entry.rb
|
68
83
|
- lib/markovian/importers/twitter/csv_importer.rb
|
69
84
|
- lib/markovian/importers/twitter/tweet.rb
|
70
85
|
- lib/markovian/text_builder.rb
|