twitter_ebooks 2.1.0 → 2.1.1

data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.7)
+    twitter_ebooks (2.1.1)
       engtagger
       fast-stemmer
       gingerice
data/README.md CHANGED
@@ -1,4 +1,4 @@
-# twitter\_ebooks 2.1.0
+# twitter\_ebooks 2.1.1
 
 Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
 
data/bin/ebooks CHANGED
@@ -2,6 +2,8 @@
 
 require 'twitter_ebooks'
 
+$debug = true
+
 module Ebooks
   APP_PATH = Dir.pwd # XXX do some recursive thing instead
 
@@ -1,5 +1,7 @@
 gem 'minitest'
 
+$debug = false
+
 def log(*args)
   STDERR.puts args.map(&:to_s).join(' ')
   STDERR.flush
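
The library now defaults `$debug` to `false`, while `bin/ebooks` flips it to `true`, so the suffix generator's per-pass output (see the `log NLP.reconstruct(tokens) if $debug` change below) only appears when running the CLI. A minimal sketch of the gating, reusing the `log` helper from this diff:

```ruby
# Sketch only: demonstrates how the new global flag gates debug output.
def log(*args)
  STDERR.puts args.map(&:to_s).join(' ')
  STDERR.flush
end

$debug = false                   # library default, as above
log "generator pass" if $debug   # suppressed
$debug = true                    # as set at the top of bin/ebooks
log "generator pass" if $debug   # printed
```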
@@ -7,7 +7,7 @@ require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :generator, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and mention tokens"
+      log "Removing commented lines and sorting mentions"
 
       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#') || l.include?('RT')
-        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-        keeping << processed.join(' ')
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))
 
       log "Segmenting text into sentences"
 
-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)
+
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
 
-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
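
`consume` now sorts corpus lines into plain statements and @-reply mentions, drops soft retweets (`RT`/`MT`), and strips @-handles and URLs per token, exposing both corpora through the new `mentions` accessor. A hypothetical usage sketch (the corpus path is illustrative):

```ruby
# Hypothetical corpus path; Model.consume and both accessors come from this diff.
model = Ebooks::Model.consume("corpus/example.txt")
model.sentences.first # tokenized statement, @-handles and URLs already rejected
model.mentions.first  # tokenized @-reply sentence, filtered the same way
```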
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
+
+      retries = 0
       tweet = ""
 
       while (tokens = generator.generate(3, :bigrams)) do
         next if tokens.length <= 3 && !responding
         break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
       end
 
-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
         while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
         end
       end
 
       tweet = NLP.reconstruct(tokens)
 
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end
 
+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def relevant_sentences(input)
+    def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
 
-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)
 
-      @sentences.each do |sent|
+      sentences.each do |sent|
         tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
             relevant << sent unless NLP.stopword?(token)
             slightly_relevant << sent
           end
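
`make_statement` gains a `retry_limit` (default 10) so generation can no longer spin indefinitely looking for a valid, non-verbatim tweet; once the limit is hit it logs a warning and keeps whatever it last produced. The new `verbatim?` predicate also checks both corpora, and `find_relevant` now matches case-insensitively against an arbitrary sentence pool. A call sketch, assuming the `model` from above:

```ruby
tweet = model.make_statement              # defaults: 140 chars, 10 retries
tweet = model.make_statement(100)         # tighter length limit, same retry cap
tweet = model.make_statement(140, nil, 50) # explicitly raise the retry ceiling
```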
@@ -115,9 +153,9 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      # First try
-      relevant, slightly_relevant = relevant_sentences(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
@@ -125,6 +163,8 @@ module Ebooks
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end
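
`make_response` now prefers the mention corpus and, through the `sentences.equal?(@mentions)` guard, recurses once over the full statement corpus before giving up and emitting an unrelated statement. Sketch (the input string is illustrative):

```ruby
# Tries @mentions first, then @sentences, then falls back to make_statement.
reply = model.make_response("what do you think of ngram models?", 140)
```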
@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing
 
       0.upto(passes-1) do
-        puts NLP.reconstruct(tokens)
+        log NLP.reconstruct(tokens) if $debug
         varsites = {} # Map bigram start site => next token alternatives
 
         tokens.each_with_index do |token, i|
@@ -1,3 +1,3 @@
 module Ebooks
-  VERSION = "2.1.0"
+  VERSION = "2.1.1"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: twitter_ebooks
 version: !ruby/object:Gem::Version
-  version: 2.1.0
+  version: 2.1.1
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-16 00:00:00.000000000 Z
+date: 2013-11-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest