twitter_ebooks 2.0.3 → 2.0.4
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- data/Gemfile.lock +12 -12
- data/bin/ebooks +21 -6
- data/data/ANC-all-count.txt +297241 -0
- data/data/stopwords.txt +204 -0
- data/data/wordfreq.json +1 -0
- data/lib/twitter_ebooks/bot.rb +25 -7
- data/lib/twitter_ebooks/markov.rb +55 -63
- data/lib/twitter_ebooks/model.rb +57 -74
- data/lib/twitter_ebooks/nlp.rb +90 -55
- data/lib/twitter_ebooks/version.rb +1 -1
- data/script/process_anc_data.rb +19 -0
- data/skeleton/Procfile +1 -1
- data/skeleton/bots.rb +0 -6
- data/skeleton/corpus/README.md +1 -1
- data/skeleton/run.rb +9 -0
- data/test/keywords.rb +18 -0
- data/twitter_ebooks.gemspec +3 -5
- metadata +13 -40
- data/skeleton/model/README.md +0 -1
data/lib/twitter_ebooks/bot.rb
CHANGED
@@ -15,6 +15,10 @@ module Ebooks
     @@all = [] # List of all defined bots
     def self.all; @@all; end
 
+    def self.get(name)
+      all.find { |bot| bot.username == name }
+    end
+
     def initialize(username, &b)
       # Set defaults
       @username = username
@@ -30,8 +34,7 @@ module Ebooks
       STDERR.flush
     end
 
-
-    def start
+    def configure
       TweetStream.configure do |config|
         config.consumer_key = @consumer_key
         config.consumer_secret = @consumer_secret
@@ -48,6 +51,13 @@ module Ebooks
 
       @twitter = Twitter::Client.new
       @stream = TweetStream::Client.new
+    end
+
+    # Connects to tweetstream and opens event handlers for this bot
+    def start
+      configure
+
+      @on_startup.call if @on_startup
 
       @stream.on_error do |msg|
         log "ERROR: #{msg}"
@@ -77,13 +87,20 @@ module Ebooks
       mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }
 
       reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
-      reply_mentions
+      reply_mentions = [ev[:user][:screen_name]] + reply_mentions
 
       meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
+      meta[:limit] = 140 - meta[:reply_prefix].length
 
       mless = ev[:text]
-
-
+      begin
+        ev.attrs[:entities][:user_mentions].reverse.each do |entity|
+          mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]+1..-1]
+        end
+      rescue Exception
+        p ev.attrs[:entities][:user_mentions]
+        p ev[:text]
+        raise
       end
       meta[:mentionless] = mless
 
@@ -92,7 +109,7 @@ module Ebooks
       # - The tweet is not being retweeted by somebody else
       # - Or soft-retweeted by somebody else
       if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
-        log "Mention from
+        log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
         @on_mention.call(ev, meta)
       else
         @on_timeline.call(ev, meta)
@@ -117,7 +134,7 @@ module Ebooks
         log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
         @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
       elsif ev.is_a? Twitter::Tweet
-        log "Replying to @#{ev[:user][:screen_name]}: #{text}"
+        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
         @twitter.update(text, in_reply_to_status_id: ev[:id])
       else
         raise Exception("Don't know how to reply to a #{ev.class}")
@@ -138,6 +155,7 @@ module Ebooks
       @twitter.update(*args)
     end
 
+    def on_startup(&b); @on_startup = b; end
     def on_follow(&b); @on_follow = b; end
     def on_mention(&b); @on_mention = b; end
     def on_timeline(&b); @on_timeline = b; end
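For bot authors, the new hooks surface roughly as in the sketch below. This is a hypothetical bot definition patterned on the gem's skeleton; the credential accessors (consumer_key, oauth_token, and friends), the username, and the reply text are assumptions, not part of this diff.

Ebooks::Bot.new("example_ebooks") do |bot|
  bot.consumer_key = "..."           # placeholder app credentials (assumed accessors)
  bot.consumer_secret = "..."
  bot.oauth_token = "..."
  bot.oauth_token_secret = "..."

  bot.on_startup do
    bot.log "Connected"              # new hook: runs once after configure, before stream events
  end

  bot.on_mention do |tweet, meta|
    # meta[:limit] now reflects 140 minus the generated reply prefix
    bot.reply(tweet, "hello there"[0...meta[:limit]])
  end
end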
data/lib/twitter_ebooks/markov.rb
CHANGED
@@ -1,69 +1,73 @@
 module Ebooks
+  # Special INTERIM token represents sentence boundaries
+  # This is so we can include start and end of statements in model
+  # Due to the way the sentence tokenizer works, can correspond
+  # to multiple actual parts of text (such as ^, $, \n and .?!)
+  INTERIM = :interim
+
+  # This is an ngram-based Markov model optimized to build from a
+  # tokenized sentence list without requiring too much transformation
   class MarkovModel
-
-
-    attr_accessor :tokens
-    attr_reader :depth
-
-    def represent(token)
-      if token.nil? || token == "\n" || token.empty?
-        INTERIM
-      else
-        token
-      end
+    def self.build(sentences)
+      MarkovModel.new.consume(sentences)
     end
 
-    def consume(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def consume(sentences)
+      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
+      # We map by both bigrams and unigrams so we can fall back to the latter in
+      # cases where an input bigram is unavailable, such as starting a sentence
+      @sentences = sentences
+      @unigrams = {}
+      @bigrams = {}
+
+      sentences.each_with_index do |tokens, i|
+        last_token = INTERIM
+        tokens.each_with_index do |token, j|
+          @unigrams[last_token] ||= []
+          @unigrams[last_token] << [i, j]
+
+          @bigrams[last_token] ||= {}
+          @bigrams[last_token][token] ||= []
+
+          if j == tokens.length-1 # Mark sentence endings
+            @unigrams[token] ||= []
+            @unigrams[token] << INTERIM
+            @bigrams[last_token][token] << INTERIM
+          else
+            @bigrams[last_token][token] << [i, j+1]
           end
-          prev_tokens << prev
-        end
 
-
-        break if j > prev_tokens.length
-        ngram = prev_tokens.last(j)
-
-        unless ngram == INTERIM && prev_tokens[-1] == INTERIM
-          @model[ngram] ||= []
-          @model[ngram] << represent(token)
-        end
+          last_token = token
         end
       end
 
       self
     end
 
+    def find_token(index)
+      if index == INTERIM
+        INTERIM
+      else
+        @sentences[index[0]][index[1]]
+      end
+    end
+
     def chain(tokens)
-
-
-
-      matches = @
-      if matches
-        #p tokens.last(i)
-        #puts "=> #{matches.inspect}"
-        next_token = matches.sample
-        break
-      end
+      if tokens.length == 1
+        matches = @unigrams[tokens[0]]
+      else
+        matches = @bigrams[tokens[-2]][tokens[-1]]
       end
 
-
+      if matches.empty?
+        # This should never happen unless a strange token is
+        # supplied from outside the dataset
+        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
+      end
 
-
+      next_token = find_token(matches.sample)
+
+      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
@@ -71,19 +75,7 @@ module Ebooks
     end
 
     def generate
-
-      NLP.reconstruct(tokens)
-    end
-
-    def serialize
-      { 'model' => @model,
-        'depth' => @depth }
-    end
-
-    def deserialize(data)
-      @model = data['model']
-      @depth = data['depth']
-      self
+      NLP.reconstruct(chain([INTERIM]))
     end
   end
 end
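A toy illustration of the rewritten MarkovModel on a hand-tokenized corpus; the sentences below are invented, and in the gem the input comes from NLP.tokenize via Model#consume.

require 'twitter_ebooks'

sentences = [
  %w(the cat sat on the mat .),
  %w(the dog slept on the rug .)
]

# build is shorthand for MarkovModel.new.consume(sentences)
model = Ebooks::MarkovModel.build(sentences)

# generate starts a chain at INTERIM, samples bigram continuations until it
# draws a sentence-ending INTERIM, then reconstructs the tokens into text
puts model.generate  # e.g. "the cat slept on the rug."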
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -7,15 +7,14 @@ require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :
+    attr_accessor :hash, :sentences, :markov, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
     end
 
     def self.load(path)
-
-      Model.new.deserialize(data)
+      Marshal.load(File.read(path))
     end
 
     def consume(txtpath)
@@ -23,7 +22,7 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and
+      log "Removing commented lines and mention tokens"
 
       lines = text.split("\n")
       keeping = []
@@ -34,70 +33,43 @@ module Ebooks
       end
       text = NLP.normalize(keeping.join("\n"))
 
-      log "Segmenting text into sentences
-      @sentences = NLP.sentences(text).reject do |s|
-        s.length > 140 || s.count('"')%2 != 0
-      end
+      log "Segmenting text into sentences"
 
-
-      @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
-      @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+      sentences = NLP.sentences(text)
 
-      log "
-      @
+      log "Tokenizing #{sentences.length} sentences"
+      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
 
-
-
+      log "Building markov model"
+      @markov = MarkovModel.build(@sentences)
 
-
-
-
-
-
-
-
+      log "Ranking keywords"
+      require 'benchmark'
+      puts Benchmark.measure {
+        @keywords = NLP.keywords(@sentences)
+        p @keywords.top(100)
+      }
+
+      self
     end
 
     def save(path)
-      data = self.serialize
       File.open(path, 'w') do |f|
-        f.write(Marshal.dump(
+        f.write(Marshal.dump(self))
       end
       self
     end
 
-    def deserialize(data)
-      @hash = data['hash']
-      @tokenized = data['tokenized']
-      @tokensets = data['tokensets']
-      @markov = MarkovModel.new.deserialize(data['markov'])
-      self
-    end
-
-    def replace_noun(sent)
-      tagged = NLP.tagger.add_tags(sent)
-
-      nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
-      to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
-      return sent if to_replace.nil?
-      replacement = NLP.nouns.sample
-      if to_replace.en.plural.length <= to_replace.length
-        replacement = replacement.en.plural(1)
-      end
-      sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
-      sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
-    end
-
     def fix(tweet)
       # This seems to require an external api call
-      begin
-
-
-
-      rescue Exception => e
-
-
-      end
+      #begin
+      #  fixer = NLP.gingerice.parse(tweet)
+      #  log fixer if fixer['corrections']
+      #  tweet = fixer['result']
+      #rescue Exception => e
+      #  log e.message
+      #  log e.backtrace
+      #end
 
       NLP.htmlentities.decode tweet
     end
@@ -115,33 +87,44 @@ module Ebooks
       fix tweet
     end
 
-    #
-    #
-    def
-
-
+    # Finds all relevant tokenized sentences to given input by
+    # comparing non-stopword token overlaps
+    def relevant_sentences(input)
+      relevant = []
+      slightly_relevant = []
 
-
-        # Very uninteresting input; no relevant response possible
-        return markov_statement(limit)
-      end
+      tokenized = NLP.tokenize(input)
 
-
-
-
-
-
+      @sentences.each do |sent|
+        tokenized.each do |token|
+          if sent.include?(token)
+            relevant << sent unless NLP.stopword?(token)
+            slightly_relevant << sent
+          end
         end
       end
 
-
+      [relevant, slightly_relevant]
+    end
 
-
-
+    # Generates a response by looking for related sentences
+    # in the corpus and building a smaller markov model from these
+    def markov_response(input, limit=140)
+      # First try
+      relevant, slightly_relevant = relevant_sentences(input)
+
+      p relevant
+      p slightly_relevant.length
+
+      if relevant.length >= 3
+        markov = MarkovModel.new.consume(relevant)
+        markov_statement(limit, markov)
+      elsif slightly_relevant.length > 5
+        markov = MarkovModel.new.consume(slightly_relevant)
+        markov_statement(limit, markov)
+      else
+        markov_statement(limit)
       end
-
-      markov = MarkovModel.new.consume(relevant.sample(100))
-      markov_statement(limit, markov)
     end
   end
 end
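The end-to-end Model workflow implied by these changes, as a sketch; the corpus path, model path, and seed text are placeholders.

require 'twitter_ebooks'

model = Ebooks::Model.consume("corpus/example.txt")   # tokenize, build markov model, rank keywords
model.save("model/example.model")                     # Marshal.dump of the whole object

model = Ebooks::Model.load("model/example.model")
puts model.markov_statement(140)                           # unseeded statement
puts model.markov_response("cats and markov chains", 140)  # seeded by non-stopword token overlap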
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -1,12 +1,16 @@
 # encoding: utf-8
-
-require '
-Linguistics.use(:en, classes: [String])
+require 'fast-stemmer'
+require 'highscore'
 
 module Ebooks
   module NLP
-    # We
-    #
+    # We deliberately limit our punctuation handling to stuff we can do consistently
+    # It'll just be a part of another token if we don't split it out, and that's fine
+    PUNCTUATION = ".?!,"
+
+    # Lazy-load NLP libraries and resources
+    # Some of this stuff is pretty heavy and we don't necessarily need
+    # to be using it all of the time
 
     def self.stopwords
       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
@@ -19,89 +23,102 @@ module Ebooks
     def self.adjectives
       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
     end
-
-    def self.
-
-      # It's too slow for finding sentences in paragraphs, hence tactful
-      require 'tokenizer'
-      @tokenizer ||= Tokenizer::Tokenizer.new(:en)
-    end
-
-    def self.tactful
-      require 'tactful_tokenizer'
-      @tactful ||= TactfulTokenizer::Model.new
+
+    def self.wordfreq
+      @wordfreq ||= JSON.load(File.read(File.join(DATA_PATH, 'wordfreq.json')))
     end
 
+    # POS tagger
     def self.tagger
       require 'engtagger'
       @tagger ||= EngTagger.new
     end
 
-
-      require 'lingua/stemmer'
-      @stemmer ||= Lingua::Stemmer.new
-    end
-
+    # Gingerice text correction service
     def self.gingerice
       require 'gingerice'
       Gingerice::Parser.new # No caching for this one
     end
 
+    # For decoding html entities
     def self.htmlentities
       require 'htmlentities'
       @htmlentities ||= HTMLEntities.new
     end
 
-  ### Utility functions
+    ### Utility functions
 
-
-      tactful.tokenize_text(text)
-    end
-
+    # We don't really want to deal with all this weird unicode punctuation
     def self.normalize(text)
       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
     end
 
+    # Split text into sentences
+    # We use ad hoc approach because fancy libraries do not deal
+    # especially well with tweet formatting, and we can fake solving
+    # the quote problem during generation
+    def self.sentences(text)
+      text.split(/\n+|(?<=[.?!])\s+/)
+    end
+
+    # Split a sentence into word-level tokens
+    # As above, this is ad hoc because tokenization libraries
+    # do not behave well wrt. things like emoticons and timestamps
     def self.tokenize(sentence)
-
-
-      # that are hard to correct.
-      sentence.split(/\s/).map do |token|
-        exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^:\w$/, /^http/]
-        if exceptions.find { |r| r.match(token) }
-          token
-        else
-          token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
-        end
-      end.flatten
+      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
+      sentence.split(regex)
     end
 
-    def self.
+    def self.stem(word)
+      Stemmer::stem_word(word.downcase)
+    end
+
+    def self.keywords(sentences)
+      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
+      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
+
+      text = Highscore::Content.new(text)
+
+      text.configure do
+        #set :multiplier, 2
+        #set :upper_case, 3
+        #set :long_words, 2
+        #set :long_words_threshold, 15
+        #set :vowels, 1 # => default: 0 = not considered
+        #set :consonants, 5 # => default: 0 = not considered
+        #set :ignore_case, true # => default: false
+        set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
+        #set :stemming, true # => default: false
+      end
+
+      text.keywords
+    end
+
+    def self.stemset(sentence)
       tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
       tokens.map(&:downcase)
             .reject { |token| stopwords.include?(token) }
+            .map { |t| stemmer.stem(t) }
             .to_set
     end
 
-
-
-
-
-
-
-
-        elsif p1 && !p2 # "foo. rah"
-          true
-        else # "foo rah"
-          true
+    # Builds a token stem frequency map
+    def self.stemfreq(sentences)
+      freqmap = {}
+      sentences.flatten.each do |token|
+        stem = NLP.stem(token)
+        freqmap[stem] ||= 0
+        freqmap[stem] += 1
       end
+      freqmap
     end
 
+    # Takes a list of tokens and builds a nice-looking sentence
     def self.reconstruct(tokens)
-      # Put tokens back together into a nice looking sentence
       text = ""
       last_token = nil
       tokens.each do |token|
+        next if token == INTERIM
         text += ' ' if last_token && space_between?(last_token, token)
         text += token
         last_token = token
@@ -109,17 +126,35 @@ module Ebooks
       text
     end
 
-    #
-
-
+    # Determine if we need to insert a space between two tokens
+    def self.space_between?(token1, token2)
+      p1 = self.punctuation?(token1)
+      p2 = self.punctuation?(token2)
+      if p1 && p2 # "foo?!"
+        false
+      elsif !p1 && p2 # "foo."
+        false
+      elsif p1 && !p2 # "foo. rah"
+        true
+      else # "foo rah"
+        true
+      end
+    end
 
     def self.punctuation?(token)
       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
     end
 
+    def self.stopword?(token)
+      @stopword_set ||= stopwords.map(&:downcase).to_set
+      @stopword_set.include?(token.downcase)
+    end
+
+    # Determine if a sample of text contains unmatched brackets or quotes
+    # This is one of the more frequent and noticeable failure modes for
+    # the markov generator; we can just tell it to retry
     def self.unmatched_enclosers?(text)
-
-      enclosers = ['**', '""', '()', '[]', '``']
+      enclosers = ['**', '""', '()', '[]', '``', "''"]
       enclosers.each do |pair|
         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
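A quick look at what the new ad hoc splitters do; the sample text is invented and the expected outputs follow from the regexes above.

require 'twitter_ebooks'

text = "This is great. Isn't it? @someone thinks so!"

p Ebooks::NLP.sentences(text)
# => ["This is great.", "Isn't it?", "@someone thinks so!"]

p Ebooks::NLP.tokenize("Isn't it?")
# => ["Isn't", "it", "?"]

p Ebooks::NLP.reconstruct(["Isn't", "it", "?"])
# => "Isn't it?"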
data/script/process_anc_data.rb
ADDED
@@ -0,0 +1,19 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'json'
+
+freqmap = {}
+
+data = File.read("data/ANC-all-count.txt")
+data = data.unpack("C*").pack("U*")
+
+data.lines.each do |l|
+  vals = l.split("\t")
+
+  freqmap[vals[0]] = vals[-1].to_i
+end
+
+File.open("data/wordfreq.json", 'w') do |f|
+  f.write(JSON.dump(freqmap))
+end
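The script above converts the bundled ANC frequency list into data/wordfreq.json, which the new NLP.wordfreq helper lazy-loads. A rough sanity check after running it from the repository root might look like the snippet below; the exact keys and counts depend on the ANC data file.

require 'json'

freq = JSON.load(File.read("data/wordfreq.json"))
p freq.length       # number of distinct words in the map
p freq.values.max   # count of the most frequent word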
data/skeleton/Procfile
CHANGED
@@ -1 +1 @@
-worker: ruby
+worker: ruby run.rb start
data/skeleton/bots.rb
CHANGED
data/skeleton/corpus/README.md
CHANGED
@@ -1 +1 @@
-Put raw text files in here
+Put any raw text files in here to be processed.