twitter_ebooks 2.0.0

This diff shows the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,145 @@
+ #!/usr/bin/env ruby
+ require 'twitter'
+ require 'tweetstream'
+ require 'rufus/scheduler'
+
+ module Ebooks
+   class Bot
+     attr_accessor :consumer_key, :consumer_secret,
+                   :oauth_token, :oauth_token_secret
+
+     attr_accessor :username
+
+     attr_reader :twitter, :stream
+
+     @@all = [] # List of all defined bots
+     def self.all; @@all; end
+
+     def initialize(username, &b)
+       # Set defaults
+       @username = username
+
+       # Override with callback
+       b.call(self)
+
+       Bot.all.push(self)
+     end
+
+     def log(*args)
+       STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
+       STDERR.flush
+     end
+
+     # Connects to tweetstream and opens event handlers for this bot
+     def start
+       TweetStream.configure do |config|
+         config.consumer_key = @consumer_key
+         config.consumer_secret = @consumer_secret
+         config.oauth_token = @oauth_token
+         config.oauth_token_secret = @oauth_token_secret
+       end
+
+       Twitter.configure do |config|
+         config.consumer_key = @consumer_key
+         config.consumer_secret = @consumer_secret
+         config.oauth_token = @oauth_token
+         config.oauth_token_secret = @oauth_token_secret
+       end
+
+       @twitter = Twitter::Client.new
+       @stream = TweetStream::Client.new
+
+       @stream.on_error do |msg|
+         log "ERROR: #{msg}"
+       end
+
+       @stream.on_inited do
+         log "Online!"
+       end
+
+       @stream.on_event(:follow) do |event|
+         log "Followed by #{event[:source][:screen_name]}"
+         @on_follow.call(event[:source])
+       end
+
+       @stream.on_direct_message do |dm|
+         next if dm[:sender][:screen_name] == @username # Don't reply to self
+         log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
+         @on_message.call(dm)
+       end
+
+       @stream.userstream do |ev|
+         next unless ev[:text] # If it's not a text-containing tweet, ignore it
+         next if ev[:user][:screen_name] == @username # Ignore our own tweets
+
+         meta = {}
+         mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }
+
+         reply_mentions = mentions.reject { |m| m.downcase == @username }
+         reply_mentions << ev[:user][:screen_name]
+
+         meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
+
+         mless = ev[:text]
+         ev.attrs[:entities][:user_mentions].reverse.each do |entity|
+           mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]+1..-1]
+         end
+         meta[:mentionless] = mless
+
+         # To check if this is a mention, ensure:
+         # - The tweet mentions list contains our username
+         # - The tweet is not being retweeted by somebody else
+         # - Or soft-retweeted by somebody else
+         if mentions.map(&:downcase).include?(@username) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
+           log "Mention from #{ev[:user][:screen_name]}: #{ev[:text]}"
+           @on_mention.call(ev, meta)
+         else
+           @on_timeline.call(ev, meta)
+         end
+       end
+     end
+
+     # Wrapper for EM.add_timer
+     # Delays add a greater sense of humanity to bot behaviour
+     def delay(time, &b)
+       time = time.to_a.sample unless time.is_a? Integer
+       EM.add_timer(time, &b)
+     end
+
+     # Reply to a tweet or a DM.
+     # Applies configurable @reply_delay range
+     def reply(ev, text, opts={})
+       opts = opts.clone
+       delay = @reply_delay.to_a.sample
+
+       if ev.is_a? Twitter::DirectMessage
+         log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
+         @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
+       elsif ev.is_a? Twitter::Tweet
+         log "Replying to @#{ev[:user][:screen_name]}: #{text}"
+         @twitter.update(text, in_reply_to_status_id: ev[:id])
+       else
+         raise Exception("Don't know how to reply to a #{ev.class}")
+       end
+     end
+
+     def scheduler
+       @scheduler ||= Rufus::Scheduler.new
+     end
+
+     def follow(*args)
+       log "Following #{args}"
+       @twitter.follow(*args)
+     end
+
+     def tweet(*args)
+       log "Tweeting #{args.inspect}"
+       @twitter.update(*args)
+     end
+
+     def on_follow(&b); @on_follow = b; end
+     def on_mention(&b); @on_mention = b; end
+     def on_timeline(&b); @on_timeline = b; end
+     def on_message(&b); @on_message = b; end
+   end
+ end
@@ -0,0 +1,89 @@
+ module Ebooks
+   class MarkovModel
+     INTERIM = :interim # Special token marking newline/^/$ boundaries
+
+     attr_accessor :tokens
+     attr_reader :depth
+
+     def represent(token)
+       if token.nil? || token == "\n" || token.empty?
+         INTERIM
+       else
+         token
+       end
+     end
+
+     def consume(tokenized, depth=2)
+       @tokens = [INTERIM]
+       @depth = depth
+
+       tokenized.each do |tokens|
+         @tokens += tokens
+         @tokens << INTERIM
+       end
+
+       @model = {}
+
+       @tokens.each_with_index do |token, i|
+         prev_tokens = []
+
+         @depth.downto(1) do |j|
+           if i-j < 0; next
+           else; prev = represent(@tokens[i-j])
+           end
+           prev_tokens << prev
+         end
+
+         1.upto(@depth) do |j|
+           break if j > prev_tokens.length
+           ngram = prev_tokens.last(j)
+
+           unless ngram == INTERIM && prev_tokens[-1] == INTERIM
+             @model[ngram] ||= []
+             @model[ngram] << represent(token)
+           end
+         end
+       end
+
+       self
+     end
+
+     def chain(tokens)
+       next_token = nil
+       @depth.downto(1).each do |i|
+         next if tokens.length < i
+         matches = @model[tokens.last(i)]
+         if matches
+           #p tokens.last(i)
+           #puts "=> #{matches.inspect}"
+           next_token = matches.sample
+           break
+         end
+       end
+
+       raise ArgumentError if next_token.nil?
+
+       if next_token == INTERIM
+         return tokens
+       else
+         return chain(tokens + [next_token])
+       end
+     end
+
+     def generate
+       tokens = chain([@model[[INTERIM]].sample])
+       NLP.reconstruct(tokens)
+     end
+
+     def serialize
+       { 'model' => @model,
+         'depth' => @depth }
+     end
+
+     def deserialize(data)
+       @model = data['model']
+       @depth = data['depth']
+       self
+     end
+   end
+ end
@@ -0,0 +1,147 @@
+ #!/usr/bin/env ruby
+ # encoding: utf-8
+
+ require 'json'
+ require 'set'
+ require 'digest/md5'
+
+ module Ebooks
+   class Model
+     attr_accessor :hash, :sentences, :tokenized, :markov
+
+     def self.consume(txtpath)
+       Model.new.consume(txtpath)
+     end
+
+     def self.load(path)
+       data = Marshal.load(File.read(path))
+       Model.new.deserialize(data)
+     end
+
+     def consume(txtpath)
+       # Record hash of source file so we know to update later
+       @hash = Digest::MD5.hexdigest(File.read(txtpath))
+
+       text = File.read(txtpath)
+       log "Removing commented lines and mentions"
+
+       lines = text.split("\n")
+       keeping = []
+       lines.each do |l|
+         next if l.start_with?('#') || l.include?('RT')
+         processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
+         keeping << processed.join(' ')
+       end
+       text = NLP.normalize(keeping.join("\n"))
+
+       log "Segmenting text into sentences of 140 characters or less"
+       @sentences = NLP.sentences(text).reject do |s|
+         s.length > 140 || s.count('"')%2 != 0
+       end
+
+       log "Tokenizing #{@sentences.length} sentences"
+       @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
+       @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+
+       log "Building markov model (this may take a while)"
+       @markov = MarkovModel.new.consume(@tokenized)
+
+       self
+     end
+
+     # Produces a hash with the data needed to quickly
+     # reconstruct this corpus object
+     def serialize
+       return { 'hash' => @hash,
+                'tokenized' => @tokenized,
+                'tokensets' => @tokensets,
+                'markov' => @markov.serialize }
+     end
+
+     def save(path)
+       data = self.serialize
+       File.open(path, 'w') do |f|
+         f.write(Marshal.dump(data))
+       end
+       self
+     end
+
+     def deserialize(data)
+       @hash = data['hash']
+       @tokenized = data['tokenized']
+       @tokensets = data['tokensets']
+       @markov = MarkovModel.new.deserialize(data['markov'])
+       self
+     end
+
+     def replace_noun(sent)
+       tagged = NLP.tagger.add_tags(sent)
+
+       nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
+       to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
+       return sent if to_replace.nil?
+       replacement = NLP.nouns.sample
+       if to_replace.en.plural.length <= to_replace.length
+         replacement = replacement.en.plural(1)
+       end
+       sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
+       sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
+     end
+
+     def fix(tweet)
+       # This seems to require an external api call
+       begin
+         fixer = NLP.gingerice.parse(tweet)
+         log fixer if fixer['corrections']
+         tweet = fixer['result']
+       rescue Exception => e
+         log e.message
+         log e.backtrace
+       end
+
+       NLP.htmlentities.decode tweet
+     end
+
+     def markov_statement(limit=140, markov=nil)
+       markov ||= @markov
+       tweet = ""
+
+       while (tweet = markov.generate) do
+         next if tweet.length > limit
+         next if NLP.unmatched_enclosers?(tweet)
+         break if tweet.length > limit*0.4 || rand > 0.8
+       end
+
+       fix tweet
+     end
+
+     # Generates a response by looking for related sentences
+     # in the corpus and building a smaller markov model from these
+     def markov_response(input, limit=140)
+       inputset = NLP.tokenset(input)
+       log "Input tokenset: #{inputset.to_a}"
+
+       if inputset.empty?
+         # Very uninteresting input; no relevant response possible
+         return markov_statement(limit)
+       end
+
+       # Let's find all the sentences that might be relevant
+       relevant = []
+       @tokensets.each_with_index.map do |set, i|
+         if inputset.intersection(set).length > 0
+           relevant << @tokenized[i]
+         end
+       end
+
+       log "Found #{relevant.length} relevant tokenset matches"
+
+       if relevant.length < 3
+         return markov_statement(limit)
+       end
+
+       markov = MarkovModel.new.consume(relevant.sample(100))
+       markov_statement(limit, markov)
+     end
+   end
+ end
@@ -0,0 +1,142 @@
+ # encoding: utf-8
+
+ require 'linguistics'
+ Linguistics.use(:en, classes: [String])
+
+ module Ebooks
+   module NLP
+     # We don't necessarily want to use all of this stuff all the time
+     # Only load it when it is needed
+
+     def self.stopwords
+       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
+     end
+
+     def self.nouns
+       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
+     end
+
+     def self.adjectives
+       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
+     end
+
+     def self.tokenizer
+       # This tokenizer is used for dividing sentences into words
+       # It's too slow for finding sentences in paragraphs, hence tactful
+       require 'tokenizer'
+       @tokenizer ||= Tokenizer::Tokenizer.new(:en)
+     end
+
+     def self.tactful
+       require 'tactful_tokenizer'
+       @tactful ||= TactfulTokenizer::Model.new
+     end
+
+     def self.tagger
+       require 'engtagger'
+       @tagger ||= EngTagger.new
+     end
+
+     def self.stemmer
+       require 'lingua/stemmer'
+       @stemmer ||= Lingua::Stemmer.new
+     end
+
+     def self.gingerice
+       require 'gingerice'
+       Gingerice::Parser.new # No caching for this one
+     end
+
+     def self.htmlentities
+       require 'htmlentities'
+       @htmlentities ||= HTMLEntities.new
+     end
+
+     ### Utility functions which wrap the above
+
+     def self.sentences(text)
+       tactful.tokenize_text(text)
+     end
+
+     def self.normalize(text)
+       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
+     end
+
+     def self.tokenize(sentence)
+       # This is hacky, but an ad hoc approach seems to be
+       # most reliable for now. Tokenization libraries have oddities
+       # that are hard to correct.
+       sentence.split(/\s/).map do |token|
+         exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^:\w$/, /^http/]
+         if exceptions.find { |r| r.match(token) }
+           token
+         else
+           token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
+         end
+       end.flatten
+     end
+
+     def self.tokenset(sentence)
+       tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
+       tokens.map(&:downcase)
+             .reject { |token| stopwords.include?(token) }
+             .to_set
+     end
+
+     def self.space_between?(token1, token2)
+       p1 = self.punctuation?(token1)
+       p2 = self.punctuation?(token2)
+       if p1 && p2 # "foo?!"
+         false
+       elsif !p1 && p2 # "foo."
+         false
+       elsif p1 && !p2 # "foo. rah"
+         true
+       else # "foo rah"
+         true
+       end
+     end
+
+     def self.reconstruct(tokens)
+       # Put tokens back together into a nice looking sentence
+       text = ""
+       last_token = nil
+       tokens.each do |token|
+         text += ' ' if last_token && space_between?(last_token, token)
+         text += token
+         last_token = token
+       end
+       text
+     end
+
+     # Deliberately limit our punctuation handling to stuff we can do consistently
+     # It'll just be a part of a token if we don't split it out, and that's fine
+     PUNCTUATION = ".?!,"
+
+     def self.punctuation?(token)
+       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
+     end
+
+     def self.unmatched_enclosers?(text)
+       # Weird quotes are an instant giveaway. Let's do paren-matching.
+       enclosers = ['**', '""', '()', '[]', '``']
+       enclosers.each do |pair|
+         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
+         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
+
+         opened = 0
+
+         tokenize(text).each do |token|
+           opened += 1 if token.match(starter)
+           opened -= 1 if token.match(ender)
+
+           return true if opened < 0 # Too many ends!
+         end
+
+         return true if opened != 0 # Mismatch somewhere.
+       end
+
+       false
+     end
+   end
+ end
@@ -0,0 +1,3 @@
+ module Ebooks
+   VERSION = "2.0.0"
+ end
@@ -0,0 +1,20 @@
+ gem 'minitest'
+
+ def log(*args)
+   STDERR.puts args.map(&:to_s).join(' ')
+   STDERR.flush
+ end
+
+ module Ebooks
+   GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+   DATA_PATH = File.join(GEM_PATH, 'data')
+   SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
+   TEST_PATH = File.join(GEM_PATH, 'test')
+   TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
+ end
+
+ require 'twitter_ebooks/nlp'
+ require 'twitter_ebooks/archiver'
+ require 'twitter_ebooks/markov'
+ require 'twitter_ebooks/model'
+ require 'twitter_ebooks/bot'
data/skeleton/Procfile ADDED
@@ -0,0 +1 @@
+ worker: ruby bots.rb start
data/skeleton/bots.rb ADDED
@@ -0,0 +1,47 @@
+ #!/usr/bin/env ruby
+
+ require 'twitter_ebooks'
+
+ # This is an example bot definition with event handlers commented out
+ # You can define as many of these as you like; they will run simultaneously
+
+ Ebooks::Bot.new("{{BOT_NAME}}") do |bot|
+   # Consumer details come from registering an app at https://dev.twitter.com/
+   # OAuth details can be fetched with https://github.com/marcel/twurl
+   bot.consumer_key = "" # Your app consumer key
+   bot.consumer_secret = "" # Your app consumer secret
+   bot.oauth_token = "" # Token connecting the app to this account
+   bot.oauth_token_secret = "" # Secret connecting the app to this account
+
+   bot.on_message do |dm|
+     # Reply to a DM
+     # bot.reply(dm, "secret secrets")
+   end
+
+   bot.on_follow do |user|
+     # Follow a user back
+     # bot.follow(user[:screen_name])
+   end
+
+   bot.on_mention do |tweet, meta|
+     # Reply to a mention
+     # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
+   end
+
+   bot.on_timeline do |tweet, meta|
+     # Reply to a tweet in the bot's timeline
+     # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
+   end
+
+   bot.scheduler.every '24h' do
+     # Tweet something every 24 hours
+     # See https://github.com/jmettraux/rufus-scheduler
+     # bot.tweet("hi")
+   end
+ end
+
+ EM.run do
+   Ebooks::Bot.all.each do |bot|
+     bot.start
+   end
+ end
@@ -0,0 +1 @@
+ Put raw text files in here and process them with `ebooks consume` to make Markov models.
@@ -0,0 +1 @@
+ This is where the output of `ebooks consume <corpus_path>` goes. You can load these files with `Model.load(path)`, or run `ebooks gen <path>` to test generation from the command line.
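
As a rough sketch of how the pieces above fit together outside the `ebooks` CLI, the `Ebooks::Model` API shown in this version can also be driven directly from Ruby. The corpus and model paths below are placeholders, and this assumes the gem and its NLP dependencies are installed (note that `markov_statement` passes generated text through Gingerice, which makes an external API call):

```ruby
require 'twitter_ebooks'

# Hypothetical paths; use any raw text corpus and output location.
corpus_path = 'corpus/example.txt'
model_path  = 'model/example.model'

# Build a model from raw text and persist it (a Marshal-dumped hash).
model = Ebooks::Model.consume(corpus_path)
model.save(model_path)

# Later: reload the serialized model and generate text from it.
model = Ebooks::Model.load(model_path)
puts model.markov_statement(140)        # standalone statement of at most 140 characters
puts model.markov_response('hello bot') # response built from related corpus sentences
```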