twitter_ebooks 2.1.0 → 2.1.1

data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.7)
+    twitter_ebooks (2.1.1)
       engtagger
       fast-stemmer
       gingerice
data/README.md CHANGED
@@ -1,4 +1,4 @@
-# twitter\_ebooks 2.1.0
+# twitter\_ebooks 2.1.1
 
 Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
 
data/bin/ebooks CHANGED
@@ -2,6 +2,8 @@
 
 require 'twitter_ebooks'
 
+$debug = true
+
 module Ebooks
   APP_PATH = Dir.pwd # XXX do some recursive thing instead
 
@@ -1,5 +1,7 @@
 gem 'minitest'
 
+$debug = false
+
 def log(*args)
   STDERR.puts args.map(&:to_s).join(' ')
   STDERR.flush
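
The library now defaults `$debug` to `false`, while `bin/ebooks` flips it to `true`, so the suffix generator's per-pass output (see the `log NLP.reconstruct(tokens) if $debug` change below) only appears when running the CLI. A minimal sketch of the gating, reusing the `log` helper from this diff:

```ruby
# Sketch only: demonstrates how the new global flag gates debug output.
def log(*args)
  STDERR.puts args.map(&:to_s).join(' ')
  STDERR.flush
end

$debug = false                   # library default, as above
log "generator pass" if $debug   # suppressed
$debug = true                    # as set at the top of bin/ebooks
log "generator pass" if $debug   # printed
```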
@@ -7,7 +7,7 @@ require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :generator, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and mention tokens"
+      log "Removing commented lines and sorting mentions"
 
       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#') || l.include?('RT')
-        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-        keeping << processed.join(' ')
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))
 
       log "Segmenting text into sentences"
 
-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)
+
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
 
-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
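
`consume` now sorts corpus lines into plain statements and @-reply mentions, drops soft retweets (`RT`/`MT`), and strips @-handles and URLs per token, exposing both corpora through the new `mentions` accessor. A hypothetical usage sketch (the corpus path is illustrative):

```ruby
# Hypothetical corpus path; Model.consume and both accessors come from this diff.
model = Ebooks::Model.consume("corpus/example.txt")
model.sentences.first # tokenized statement, @-handles and URLs already rejected
model.mentions.first  # tokenized @-reply sentence, filtered the same way
```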
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
+
+      retries = 0
       tweet = ""
 
       while (tokens = generator.generate(3, :bigrams)) do
         next if tokens.length <= 3 && !responding
         break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
       end
 
-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
         while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
         end
       end
 
       tweet = NLP.reconstruct(tokens)
 
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end
 
+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def relevant_sentences(input)
+    def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
 
-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)
 
-      @sentences.each do |sent|
+      sentences.each do |sent|
         tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
             relevant << sent unless NLP.stopword?(token)
             slightly_relevant << sent
           end
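
`make_statement` gains a `retry_limit` (default 10) so generation can no longer spin indefinitely looking for a valid, non-verbatim tweet; once the limit is hit it logs a warning and keeps whatever it last produced. The new `verbatim?` predicate also checks both corpora, and `find_relevant` now matches case-insensitively against an arbitrary sentence pool. A call sketch, assuming the `model` from above:

```ruby
tweet = model.make_statement              # defaults: 140 chars, 10 retries
tweet = model.make_statement(100)         # tighter length limit, same retry cap
tweet = model.make_statement(140, nil, 50) # explicitly raise the retry ceiling
```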
@@ -115,9 +153,9 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      # First try
-      relevant, slightly_relevant = relevant_sentences(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
@@ -125,6 +163,8 @@ module Ebooks
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end
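
`make_response` now prefers the mention corpus and, through the `sentences.equal?(@mentions)` guard, recurses once over the full statement corpus before giving up and emitting an unrelated statement. Sketch (the input string is illustrative):

```ruby
# Tries @mentions first, then @sentences, then falls back to make_statement.
reply = model.make_response("what do you think of ngram models?", 140)
```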
@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing
 
       0.upto(passes-1) do
-        puts NLP.reconstruct(tokens)
+        log NLP.reconstruct(tokens) if $debug
         varsites = {} # Map bigram start site => next token alternatives
 
         tokens.each_with_index do |token, i|
@@ -1,3 +1,3 @@
 module Ebooks
-  VERSION = "2.1.0"
+  VERSION = "2.1.1"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: twitter_ebooks
 version: !ruby/object:Gem::Version
-  version: 2.1.0
+  version: 2.1.1
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-16 00:00:00.000000000 Z
+date: 2013-11-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest