markovian 0.2.0 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1f03bf82f92106eab96196be22cb2e09eb3d022b
4
- data.tar.gz: fe37c3945b518f64d617be2758aef29a5dbfc52a
3
+ metadata.gz: 3c30d2a0dcf0c8488bab6aa456ecd5a761fa04ae
4
+ data.tar.gz: c4900a0c75636c29e33697d9ab3bcff85bb13828
5
5
  SHA512:
6
- metadata.gz: 219a602a1d41dc3aaff820ad616d0690b0a42810f40e83e08a219537f6ba765855b2b8008e4af3fe0537570040d86b0f499222a3a9e5815a44a827dfadfe418e
7
- data.tar.gz: d00c3cf9bc069f775edf0b496b5ccf79800129a50dbe3d0069b4401294393561cedbb9f3606a855ffa6081e65cf0cd3272bf3861f777d4cd8883dc9353a9a9e7
6
+ metadata.gz: 01e05e4c15feb7a2938603a6ef7fda354871d1474e493701475d5f1600498633c525982b9ca4b210b22303b6d18effe0c1d29412ee5fb201218cf4a0bd8f2a83
7
+ data.tar.gz: 99ff58a5fa7b60a0bfdd69cf450ba650e4958e6047ed18dd07ea8c6e211974c696d87fb671a46103257935b7d36d60fe7d6b25f9ac21bf7c8f387e5e8c4d6066
data/README.md CHANGED
@@ -35,6 +35,13 @@ After checking out the repo, run `bin/setup` to install dependencies. You can al
35
35
 
36
36
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
37
37
 
38
+ ## Related Projects
39
+
40
+ * [tokeneyes](https://github.com/arsduo/tokeneyes): a tokenizing system used by Markovian that
41
+ provides info on punctuation and sentence flow (to be incorporated shortly)
42
+ * [markovian-lambda](https://github.com/arsduo/markovian-lambda): an inchoate set of utility
43
+ classes for using Markovian, eventually with AWS Lambda.
44
+
38
45
  ## Contributing
39
46
 
40
47
  Bug reports and pull requests are welcome on GitHub at https://github.com/arsduo/markovian. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
@@ -42,4 +49,3 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/arsduo
42
49
  ## License
43
50
 
44
51
  The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
45
-
data/changelog.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 0.2.9
4
+
5
+ Internal refactors only, no new functionality.
6
+
7
+ * Refactor Dictionary to use DictionaryEntry objects, which can store additional metadata
8
+ * Use Tokeneyes to parse strings rather than the original String#split-based TextSplitter
9
+
3
10
  ## 0.2.0
4
11
 
5
12
  * Rename Chainset/Chain to Corpus (better name is better)
@@ -11,7 +11,6 @@ module Markovian
11
11
  @two_key_dictionary = Dictionary.new
12
12
  end
13
13
 
14
- attr_reader :one_key_dictionary, :two_key_dictionary
15
14
  def lengthen(word, next_word:, previous_word: nil)
16
15
  @one_key_dictionary.push(word, next_word)
17
16
  @two_key_dictionary.push(two_word_key(previous_word, word), next_word)
@@ -33,6 +32,9 @@ module Markovian
33
32
 
34
33
  protected
35
34
 
35
+ # for equality checking
36
+ attr_reader :one_key_dictionary, :two_key_dictionary
37
+
36
38
  def result_for_two_words(previous_word, word)
37
39
  @two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
38
40
  end
@@ -49,4 +51,3 @@ module Markovian
49
51
  end
50
52
  end
51
53
  end
52
-
@@ -1,17 +1,21 @@
1
- # This class represents a dictionary of words or phrases and the various words that can follow
2
- # them. Currently it's implemented as a hash of arrays, but more advanced representations may
3
- # follow.
1
+ require 'markovian/corpus/dictionary_entry'
4
2
  #
5
- # The key is an opaque value, which could represent either a single word or a phrase as desired.
3
+ # This class represents a dictionary of words or phrases and the various words that can follow
4
+ # them. The key is an opaque value, which could represent either a single word or a phrase as desired.
6
5
  module Markovian
7
6
  class Corpus
8
7
  class Dictionary
9
- def push(key, word)
10
- dictionary[key] += [word]
8
+ def push(key, word, direction: :forwards)
9
+ # Incoming we get a Tokeneyes::Word object
10
+ dictionary[key.to_s].push(word, direction: direction)
11
11
  end
12
12
 
13
13
  def next_word(key)
14
- dictionary[key].sample
14
+ dictionary[key].next_word
15
+ end
16
+
17
+ def previous_word(key)
18
+ dictionary[key].previous_word
15
19
  end
16
20
 
17
21
  def random_word
@@ -32,7 +36,9 @@ module Markovian
32
36
  protected
33
37
 
34
38
  def dictionary
35
- @dictionary ||= Hash.new([])
39
+ # We assign the entry inside the block because a Hash default is only returned, never stored;
40
+ # without the assignment, each lookup of a missing key would build and discard a new entry.
41
+ @dictionary ||= Hash.new {|hash, key| hash[key] = DictionaryEntry.new(key)}
36
42
  end
37
43
  end
38
44
  end
@@ -0,0 +1,54 @@
1
+ module Markovian
2
+ class Corpus
3
+ class DictionaryEntry
4
+ attr_reader :word, :count
5
+ def initialize(word)
6
+ @word = word
7
+ @next_words = []
8
+ @previous_words = []
9
+ @count = 0
10
+ end
11
+
12
+ def push(word, direction: :forwards)
13
+ # The incoming word will be a Tokeneyes::Word object
14
+ array_for_direction(direction) << word.to_s
15
+ # we don't want to double-count words if we read the text both forward and backward, so
16
+ # only count in the forward direction. (If we encounter a scenario where someone only wants
17
+ # to read in the backward direction, we can deal with that then.)
18
+ @count += 1 if direction == :forwards
19
+ end
20
+
21
+ def next_word
22
+ next_words.sample
23
+ end
24
+
25
+ def previous_word
26
+ previous_words.sample
27
+ end
28
+
29
+ def ==(other)
30
+ self.word == other.word &&
31
+ self.next_words == other.next_words &&
32
+ self.previous_words == other.previous_words
33
+ end
34
+
35
+ protected
36
+
37
+ # for equality checking
38
+ attr_reader :next_words, :previous_words
39
+
40
+ VALID_DIRECTIONS = [:backwards, :forwards]
41
+
42
+ def array_for_direction(direction)
43
+ validate_direction(direction)
44
+ direction == :backwards ? previous_words : next_words
45
+ end
46
+
47
+ def validate_direction(direction)
48
+ unless VALID_DIRECTIONS.include?(direction)
49
+ raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
@@ -58,7 +58,8 @@ module Markovian
58
58
  end
59
59
 
60
60
  def split_seed_text(seed_text)
61
- Utils::TextSplitter.new(seed_text).components
61
+ # We get back Tokeneyes::Word objects, but for now only care about the strings within
62
+ Utils::TextSplitter.new(seed_text).components.map(&:to_s)
62
63
  end
63
64
  end
64
65
  end
@@ -1,3 +1,5 @@
1
+ require 'tokeneyes'
2
+
1
3
  module Markovian
2
4
  module Utils
3
5
  class TextSplitter
@@ -6,29 +8,17 @@ module Markovian
6
8
  @text = text
7
9
  end
8
10
 
9
- # We split on spaces, quotes, (various symbols followed by either another dash, a space,
10
- # another dot, or the end of the text), or (colons preceded by space or the beginning of the
11
- # text).
12
- # We don't want to block things like Jones-Smith, tl;dr, abc.def, or it's.
13
- # Any of the following:
14
- # [\s\(\)] - a space or parentheses on their own
15
- # " - a quote on its own
16
- # [\.-:;\?\!]([-\.\s]|$) - a period, dash, ?, or ! followed by a space, period, dash, or the
17
- # end of the word
18
- # [\s^]' - a single ' following a non-letter
19
- WORD_DELIMITERS = /([\s\(\)]|"|[\.\-:;\?\!]([\-\.\s]|$)|[\s^]')/
20
-
21
11
  # anything that doesn't contain any letters is not a word we need to care about
22
12
  MARKERS_OF_INTEREST = /[A-Za-z]/
23
13
 
24
14
  def components
25
- split_text.select {|t| t.match(MARKERS_OF_INTEREST)}
15
+ split_text.select {|w| w.text.match(MARKERS_OF_INTEREST)}
26
16
  end
27
17
 
28
18
  protected
29
19
 
30
20
  def split_text
31
- text.downcase.split(WORD_DELIMITERS)
21
+ Tokeneyes::Tokenizer.new(text.downcase).parse_into_words
32
22
  end
33
23
  end
34
24
  end
@@ -1,3 +1,3 @@
1
1
  module Markovian
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.9"
3
3
  end
data/markovian.gemspec CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
20
  spec.require_paths = ["lib"]
21
21
 
22
+ spec.add_runtime_dependency "tokeneyes", "~> 0.1.0"
22
23
  spec.add_development_dependency "bundler", "~> 1.7"
23
24
  spec.add_development_dependency "rake", "~> 10.0"
24
25
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markovian
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Koppel
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-14 00:00:00.000000000 Z
11
+ date: 2015-09-28 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: tokeneyes
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.1.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.1.0
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: bundler
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -65,6 +79,7 @@ files:
65
79
  - lib/markovian/corpus/chain.rb
66
80
  - lib/markovian/corpus/compiler.rb
67
81
  - lib/markovian/corpus/dictionary.rb
82
+ - lib/markovian/corpus/dictionary_entry.rb
68
83
  - lib/markovian/importers/twitter/csv_importer.rb
69
84
  - lib/markovian/importers/twitter/tweet.rb
70
85
  - lib/markovian/text_builder.rb