markovian 0.2.0 → 0.2.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1f03bf82f92106eab96196be22cb2e09eb3d022b
4
- data.tar.gz: fe37c3945b518f64d617be2758aef29a5dbfc52a
3
+ metadata.gz: 3c30d2a0dcf0c8488bab6aa456ecd5a761fa04ae
4
+ data.tar.gz: c4900a0c75636c29e33697d9ab3bcff85bb13828
5
5
  SHA512:
6
- metadata.gz: 219a602a1d41dc3aaff820ad616d0690b0a42810f40e83e08a219537f6ba765855b2b8008e4af3fe0537570040d86b0f499222a3a9e5815a44a827dfadfe418e
7
- data.tar.gz: d00c3cf9bc069f775edf0b496b5ccf79800129a50dbe3d0069b4401294393561cedbb9f3606a855ffa6081e65cf0cd3272bf3861f777d4cd8883dc9353a9a9e7
6
+ metadata.gz: 01e05e4c15feb7a2938603a6ef7fda354871d1474e493701475d5f1600498633c525982b9ca4b210b22303b6d18effe0c1d29412ee5fb201218cf4a0bd8f2a83
7
+ data.tar.gz: 99ff58a5fa7b60a0bfdd69cf450ba650e4958e6047ed18dd07ea8c6e211974c696d87fb671a46103257935b7d36d60fe7d6b25f9ac21bf7c8f387e5e8c4d6066
data/README.md CHANGED
@@ -35,6 +35,13 @@ After checking out the repo, run `bin/setup` to install dependencies. You can al
35
35
 
36
36
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
37
37
 
38
+ ## Related Projects
39
+
40
+ * [tokeneyes](https://github.com/arsduo/tokeneyes): a tokenizing system used by Markovian that
41
+ provides info on punctuation and sentence flow (to be incorporated shortly)
42
+ * [markovian-lambda](https://github.com/arsduo/markovian-lambda): an inchoate set of utility
43
+ classes for using Markovian, eventually with AWS Lambda.
44
+
38
45
  ## Contributing
39
46
 
40
47
  Bug reports and pull requests are welcome on GitHub at https://github.com/arsduo/markovian. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
@@ -42,4 +49,3 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/arsduo
42
49
  ## License
43
50
 
44
51
  The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
45
-
data/changelog.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 0.2.9
4
+
5
+ Internal refactors only, no new functionality.
6
+
7
+ * Refactor Dictionary to use DictionaryEntry objects, which can store additional metadata
8
+ * Use Tokeneyes to parse strings rather than the original String#split-based TextSplitter
9
+
3
10
  ## 0.2.0
4
11
 
5
12
  * Rename Chainset/Chain to Corpus (better name is better)
@@ -11,7 +11,6 @@ module Markovian
11
11
  @two_key_dictionary = Dictionary.new
12
12
  end
13
13
 
14
- attr_reader :one_key_dictionary, :two_key_dictionary
15
14
  def lengthen(word, next_word:, previous_word: nil)
16
15
  @one_key_dictionary.push(word, next_word)
17
16
  @two_key_dictionary.push(two_word_key(previous_word, word), next_word)
@@ -33,6 +32,9 @@ module Markovian
33
32
 
34
33
  protected
35
34
 
35
+ # for equality checking
36
+ attr_reader :one_key_dictionary, :two_key_dictionary
37
+
36
38
  def result_for_two_words(previous_word, word)
37
39
  @two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
38
40
  end
@@ -49,4 +51,3 @@ module Markovian
49
51
  end
50
52
  end
51
53
  end
52
-
@@ -1,17 +1,21 @@
1
- # This class represents a dictionary of words or phrases and the various words that can follow
2
- # them. Currently it's implemented as a hash of arrays, but more advanced representations may
3
- # follow.
1
+ require 'markovian/corpus/dictionary_entry'
4
2
  #
5
- # The key is an opaque value, which could represent either a single word or a phrase as desired.
3
+ # This class represents a dictionary of words or phrases and the various words that can follow
4
+ # them. The key is an opaque value, which could represent either a single word or a phrase as desired.
6
5
  module Markovian
7
6
  class Corpus
8
7
  class Dictionary
9
- def push(key, word)
10
- dictionary[key] += [word]
8
+ def push(key, word, direction: :forwards)
9
+ # On the way in, we get a Tokeneyes::Word object
10
+ dictionary[key.to_s].push(word, direction: direction)
11
11
  end
12
12
 
13
13
  def next_word(key)
14
- dictionary[key].sample
14
+ dictionary[key].next_word
15
+ end
16
+
17
+ def previous_word(key)
18
+ dictionary[key].previous_word
15
19
  end
16
20
 
17
21
  def random_word
@@ -32,7 +36,9 @@ module Markovian
32
36
  protected
33
37
 
34
38
  def dictionary
35
- @dictionary ||= Hash.new([])
39
+ # We have to assign the value to the hash inside the block; otherwise the default entry is
40
+ # returned to the caller but never stored under the key. Default hash values are easy to get wrong in general.
41
+ @dictionary ||= Hash.new {|hash, key| hash[key] = DictionaryEntry.new(key)}
36
42
  end
37
43
  end
38
44
  end
@@ -0,0 +1,54 @@
1
+ module Markovian
2
+ class Corpus
3
+ class DictionaryEntry
4
+ attr_reader :word, :count
5
+ def initialize(word)
6
+ @word = word
7
+ @next_words = []
8
+ @previous_words = []
9
+ @count = 0
10
+ end
11
+
12
+ def push(word, direction: :forwards)
13
+ # The incoming word will be a Tokeneyes::Word object
14
+ array_for_direction(direction) << word.to_s
15
+ # we don't want to double-count words if we read the text both forward and backward, so
16
+ # only count in the forward direction. (If we encounter a scenario where someone only wants
17
+ # to read in the backward direction, we can deal with that then.)
18
+ @count += 1 if direction == :forwards
19
+ end
20
+
21
+ def next_word
22
+ next_words.sample
23
+ end
24
+
25
+ def previous_word
26
+ previous_words.sample
27
+ end
28
+
29
+ def ==(other)
30
+ self.word == other.word &&
31
+ self.next_words == other.next_words &&
32
+ self.previous_words == other.previous_words
33
+ end
34
+
35
+ protected
36
+
37
+ # for equality checking
38
+ attr_reader :next_words, :previous_words
39
+
40
+ VALID_DIRECTIONS = [:backwards, :forwards]
41
+
42
+ def array_for_direction(direction)
43
+ validate_direction(direction)
44
+ direction == :backwards ? previous_words : next_words
45
+ end
46
+
47
+ def validate_direction(direction)
48
+ unless VALID_DIRECTIONS.include?(direction)
49
+ raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
@@ -58,7 +58,8 @@ module Markovian
58
58
  end
59
59
 
60
60
  def split_seed_text(seed_text)
61
- Utils::TextSplitter.new(seed_text).components
61
+ # We get back Tokeneyes::Word objects, but for now only care about the strings within
62
+ Utils::TextSplitter.new(seed_text).components.map(&:to_s)
62
63
  end
63
64
  end
64
65
  end
@@ -1,3 +1,5 @@
1
+ require 'tokeneyes'
2
+
1
3
  module Markovian
2
4
  module Utils
3
5
  class TextSplitter
@@ -6,29 +8,17 @@ module Markovian
6
8
  @text = text
7
9
  end
8
10
 
9
- # We split on spaces, quotes, (various symbols followed by either another dash, a space,
10
- # another dot, or the end of the text), or (colons preceded by space or the beginning of the
11
- # text).
12
- # We don't want to block things like Jones-Smith, tl;dr, abc.def, or it's.
13
- # Any of the following:
14
- # [\s\(\)] - a space or parentheses on their own
15
- # " - a quote on its own
16
- # [\.-:;\?\!]([-\.\s]|$) - a period, dash, ?, or ! followed by a space, period, dash, or the
17
- # end of the word
18
- # [\s^]' - a single ' following a non-letter
19
- WORD_DELIMITERS = /([\s\(\)]|"|[\.\-:;\?\!]([\-\.\s]|$)|[\s^]')/
20
-
21
11
  # anything that doesn't contain any letters is not a word we need to care about
22
12
  MARKERS_OF_INTEREST = /[A-Za-z]/
23
13
 
24
14
  def components
25
- split_text.select {|t| t.match(MARKERS_OF_INTEREST)}
15
+ split_text.select {|w| w.text.match(MARKERS_OF_INTEREST)}
26
16
  end
27
17
 
28
18
  protected
29
19
 
30
20
  def split_text
31
- text.downcase.split(WORD_DELIMITERS)
21
+ Tokeneyes::Tokenizer.new(text.downcase).parse_into_words
32
22
  end
33
23
  end
34
24
  end
@@ -1,3 +1,3 @@
1
1
  module Markovian
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.9"
3
3
  end
data/markovian.gemspec CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
20
  spec.require_paths = ["lib"]
21
21
 
22
+ spec.add_runtime_dependency "tokeneyes", "~> 0.1.0"
22
23
  spec.add_development_dependency "bundler", "~> 1.7"
23
24
  spec.add_development_dependency "rake", "~> 10.0"
24
25
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markovian
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Koppel
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-14 00:00:00.000000000 Z
11
+ date: 2015-09-28 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: tokeneyes
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.1.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.1.0
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: bundler
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -65,6 +79,7 @@ files:
65
79
  - lib/markovian/corpus/chain.rb
66
80
  - lib/markovian/corpus/compiler.rb
67
81
  - lib/markovian/corpus/dictionary.rb
82
+ - lib/markovian/corpus/dictionary_entry.rb
68
83
  - lib/markovian/importers/twitter/csv_importer.rb
69
84
  - lib/markovian/importers/twitter/tweet.rb
70
85
  - lib/markovian/text_builder.rb