twitter_ebooks 2.1.0 → 2.1.1

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.7)
+    twitter_ebooks (2.1.1)
       engtagger
       fast-stemmer
       gingerice
data/README.md CHANGED
@@ -1,4 +1,4 @@
-# twitter\_ebooks 2.1.0
+# twitter\_ebooks 2.1.1
 
 Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
 
data/bin/ebooks CHANGED
@@ -2,6 +2,8 @@
 
 require 'twitter_ebooks'
 
+$debug = true
+
 module Ebooks
   APP_PATH = Dir.pwd # XXX do some recursive thing instead
 
@@ -1,5 +1,7 @@
 gem 'minitest'
 
+$debug = false
+
 def log(*args)
   STDERR.puts args.map(&:to_s).join(' ')
   STDERR.flush
@@ -7,7 +7,7 @@ require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :generator, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and mention tokens"
+      log "Removing commented lines and sorting mentions"
 
       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#') || l.include?('RT')
-        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-        keeping << processed.join(' ')
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))
 
       log "Segmenting text into sentences"
 
-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)
+
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
 
-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
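The consume hunk above replaces per-token stripping with line-level sorting: commented lines and soft retweets are dropped, lines containing '@' are routed to a new mentions corpus, and @-mentions and URLs are only removed later, at the token stage. A minimal standalone sketch of just the sorting step, using hypothetical corpus lines (the NLP calls are omitted):

    # Hypothetical corpus lines; only the sorting logic from Model#consume is shown.
    lines = [
      "# comment line from an exporter", # starts with '#': skipped
      "RT @someone: their tweet",        # soft retweet: skipped
      "@friend sounds good to me",       # contains '@': routed to mentions
      "just setting up my twttr",        # plain statement: kept
    ]

    keeping = []
    mentions = []
    lines.each do |l|
      next if l.start_with?('#')
      next if l.include?('RT') || l.include?('MT')
      (l.include?('@') ? mentions : keeping) << l
    end

    p keeping  # => ["just setting up my twttr"]
    p mentions # => ["@friend sounds good to me"]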
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
+
+      retries = 0
       tweet = ""
 
       while (tokens = generator.generate(3, :bigrams)) do
         next if tokens.length <= 3 && !responding
         break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
       end
 
-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
         while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
         end
       end
 
       tweet = NLP.reconstruct(tokens)
 
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end
 
+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def relevant_sentences(input)
+    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []
 
-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)
 
-      @sentences.each do |sent|
+      sentences.each do |sent|
        tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
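The new retry_limit parameter bounds both generation loops, so a model whose generator can only reproduce corpus sentences verbatim now logs a warning and returns its best attempt instead of looping indefinitely. A hedged usage sketch (the corpus path is hypothetical, and this assumes consume returns the model as in the released source):

    require 'twitter_ebooks'

    model = Ebooks::Model.consume("corpus/example.txt") # hypothetical path
    model.make_statement              # default: give up after 10 failed attempts
    model.make_statement(140, nil, 3) # stricter: log and fall back after 3 retries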
@@ -115,9 +153,9 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      # First try
-      relevant, slightly_relevant = relevant_sentences(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
@@ -125,6 +163,8 @@ module Ebooks
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end
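Taken together, make_response now searches the mention corpus first and falls back to the full statement corpus through the sentences.equal?(@mentions) branch; because the recursive call passes @sentences, that branch cannot fire a second time. A sketch of the resulting decision chain, for a hypothetical input:

    model.make_response("what do you think of ruby?")
    # 1. find_relevant(@mentions, input)  -> enough relevant mentions? respond from them
    # 2. else make_response(input, limit, @sentences) -> retry against the full corpus
    # 3. else make_statement(limit)       -> unrelated random statement as a last resort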
@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing
 
       0.upto(passes-1) do
-        puts NLP.reconstruct(tokens)
+        log NLP.reconstruct(tokens) if $debug
         varsites = {} # Map bigram start site => next token alternatives
 
         tokens.each_with_index do |token, i|
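With the new $debug flag (true in bin/ebooks, false when the library loads), the per-pass dump of intermediate reconstructions in SuffixGenerator becomes opt-in. A script using the library directly could enable it like so (a sketch; the corpus path is hypothetical, and the flag must be set after the require since the library initializes it to false):

    require 'twitter_ebooks'

    $debug = true # enable per-pass logging inside SuffixGenerator#generate
    model = Ebooks::Model.consume("corpus/example.txt")       # hypothetical path
    generator = Ebooks::SuffixGenerator.build(model.sentences)
    generator.generate(3, :bigrams) # each pass is now logged to STDERR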
@@ -1,3 +1,3 @@
 module Ebooks
-  VERSION = "2.1.0"
+  VERSION = "2.1.1"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: twitter_ebooks
 version: !ruby/object:Gem::Version
-  version: 2.1.0
+  version: 2.1.1
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-16 00:00:00.000000000 Z
+date: 2013-11-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest