twitter_ebooks_poll 3.2.0
- checksums.yaml +7 -0
- data/.gitattributes +2 -0
- data/.gitignore +6 -0
- data/.rspec +1 -0
- data/.travis.yml +8 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +167 -0
- data/Rakefile +2 -0
- data/bin/ebooks +449 -0
- data/data/adjectives.txt +1466 -0
- data/data/nouns.txt +2193 -0
- data/lib/twitter_ebooks/archive.rb +116 -0
- data/lib/twitter_ebooks/bot.rb +521 -0
- data/lib/twitter_ebooks/model.rb +336 -0
- data/lib/twitter_ebooks/nlp.rb +195 -0
- data/lib/twitter_ebooks/suffix.rb +104 -0
- data/lib/twitter_ebooks/sync.rb +52 -0
- data/lib/twitter_ebooks/version.rb +3 -0
- data/lib/twitter_ebooks.rb +22 -0
- data/skeleton/Gemfile +4 -0
- data/skeleton/Procfile +1 -0
- data/skeleton/bots.rb +65 -0
- data/skeleton/corpus/.gitignore +0 -0
- data/skeleton/gitignore +1 -0
- data/skeleton/image/.gitignore +0 -0
- data/skeleton/model/.gitignore +0 -0
- data/skeleton/stopwords.txt +843 -0
- data/spec/bot_spec.rb +216 -0
- data/spec/data/0xabad1dea.json +203945 -0
- data/spec/data/0xabad1dea.model +6158 -1
- data/spec/memprof.rb +37 -0
- data/spec/model_spec.rb +88 -0
- data/spec/spec_helper.rb +6 -0
- data/twitter_ebooks.gemspec +37 -0
- metadata +309 -0
data/lib/twitter_ebooks/model.rb
@@ -0,0 +1,336 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'
require 'csv'

module Ebooks
  class Model
    # @return [Array<String>]
    # An array of unique tokens. This is the main source of actual strings
    # in the model. Manipulation of a token is done using its index
    # in this array, which we call a "tiki"
    attr_accessor :tokens

    # @return [Array<Array<Integer>>]
    # Sentences represented by arrays of tikis
    attr_accessor :sentences

    # @return [Array<Array<Integer>>]
    # Sentences derived from Twitter mentions
    attr_accessor :mentions

    # @return [Array<String>]
    # The top 200 most important keywords, in descending order
    attr_accessor :keywords

    # Generate a new model from a corpus file
    # @param path [String]
    # @return [Ebooks::Model]
    def self.consume(path)
      Model.new.consume(path)
    end

    # Generate a new model from multiple corpus files
    # @param paths [Array<String>]
    # @return [Ebooks::Model]
    def self.consume_all(paths)
      Model.new.consume_all(paths)
    end

    # Load a saved model
    # @param path [String]
    # @return [Ebooks::Model]
    def self.load(path)
      model = Model.new
      model.instance_eval do
        props = Marshal.load(File.open(path, 'rb') { |f| f.read })
        @tokens = props[:tokens]
        @sentences = props[:sentences]
        @mentions = props[:mentions]
        @keywords = props[:keywords]
      end
      model
    end

    # Save model to a file
    # @param path [String]
    def save(path)
      File.open(path, 'wb') do |f|
        f.write(Marshal.dump({
          tokens: @tokens,
          sentences: @sentences,
          mentions: @mentions,
          keywords: @keywords
        }))
      end
      self
    end

    # Append a generated model to existing model file instead of overwriting it
    # @param path [String]
    def append(path)
      existing = File.file?(path)
      if !existing
        log "No existing model found at #{path}"
        return
      else
        # Read in and deserialize existing model
        props = Marshal.load(File.open(path, 'rb') { |old| old.read })
        old_tokens = props[:tokens]
        old_sentences = props[:sentences]
        old_mentions = props[:mentions]
        old_keywords = props[:keywords]

        # Append existing properties to new ones and overwrite with new model
        File.open(path, 'wb') do |f|
          f.write(Marshal.dump({
            tokens: @tokens.concat(old_tokens),
            sentences: @sentences.concat(old_sentences),
            mentions: @mentions.concat(old_mentions),
            keywords: @keywords.concat(old_keywords)
          }))
        end
      end
      self
    end


    def initialize
      @tokens = []

      # Reverse lookup tiki by token, for faster generation
      @tikis = {}
    end

    # Reverse lookup a token index from a token
    # @param token [String]
    # @return [Integer]
    def tikify(token)
      if @tikis.has_key?(token) then
        return @tikis[token]
      else
        (@tokens.length+1)%1000 == 0 and puts "#{@tokens.length+1} tokens"
        @tokens << token
        return @tikis[token] = @tokens.length-1
      end
    end

    # Convert a body of text into arrays of tikis
    # @param text [String]
    # @return [Array<Array<Integer>>]
    def mass_tikify(text)
      sentences = NLP.sentences(text)

      sentences.map do |s|
        tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end

        tokens.map { |t| tikify(t) }
      end
    end

    # Consume a corpus into this model
    # @param path [String]
    def consume(path)
      content = File.read(path, :encoding => 'utf-8')

      if path.split('.')[-1] == "json"
        log "Reading json corpus from #{path}"
        lines = JSON.parse(content).map do |tweet|
          tweet['text']
        end
      elsif path.split('.')[-1] == "csv"
        log "Reading CSV corpus from #{path}"
        content = CSV.parse(content)
        header = content.shift
        text_col = header.index('text')
        lines = content.map do |tweet|
          tweet[text_col]
        end
      else
        log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
        lines = content.split("\n")
      end

      consume_lines(lines)
    end

    # Consume a sequence of lines
    # @param lines [Array<String>]
    def consume_lines(lines)
      log "Removing commented lines and sorting mentions"

      statements = []
      mentions = []
      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets

        if l.include?('@')
          mentions << NLP.normalize(l)
        else
          statements << NLP.normalize(l)
        end
      end

      text = statements.join("\n").encode('UTF-8', :invalid => :replace)
      mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)

      lines = nil; statements = nil; mentions = nil # Allow garbage collection

      log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"

      @sentences = mass_tikify(text)
      @mentions = mass_tikify(mention_text)

      log "Ranking keywords"
      @keywords = NLP.keywords(text).top(200).map(&:to_s)
      log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"

      self
    end

    # Consume multiple corpuses into this model
    # @param paths [Array<String>]
    def consume_all(paths)
      lines = []
      paths.each do |path|
        content = File.read(path, :encoding => 'utf-8')

        if path.split('.')[-1] == "json"
          log "Reading json corpus from #{path}"
          l = JSON.parse(content).map do |tweet|
            tweet['text']
          end
          lines.concat(l)
        elsif path.split('.')[-1] == "csv"
          log "Reading CSV corpus from #{path}"
          content = CSV.parse(content)
          header = content.shift
          text_col = header.index('text')
          l = content.map do |tweet|
            tweet[text_col]
          end
          lines.concat(l)
        else
          log "Reading plaintext corpus from #{path}"
          l = content.split("\n")
          lines.concat(l)
        end
      end
      consume_lines(lines)
    end

    # Correct encoding issues in generated text
    # @param text [String]
    # @return [String]
    def fix(text)
      NLP.htmlentities.decode text
    end

    # Check if an array of tikis comprises a valid tweet
    # @param tikis [Array<Integer>]
    # @param limit Integer how many chars we have left
    def valid_tweet?(tikis, limit)
      tweet = NLP.reconstruct(tikis, @tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end

    # Generate some text
    # @param limit [Integer] available characters
    # @param generator [SuffixGenerator, nil]
    # @param retry_limit [Integer] how many times to retry on invalid tweet
    # @return [String]
    def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      retries = 0
      tweet = ""

      while (tikis = generator.generate(3, :bigrams)) do
        #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
        break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)

        retries += 1
        break if retries >= retry_limit
      end

      if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
        #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
        while (tikis = generator.generate(3, :unigrams)) do
          break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

          retries += 1
          break if retries >= retry_limit
        end
      end

      tweet = NLP.reconstruct(tikis, @tokens)

      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
      end

      fix tweet
    end

    # Test if a sentence has been copied verbatim from original
    # @param tikis [Array<Integer>]
    # @return [Boolean]
    def verbatim?(tikis)
      @sentences.include?(tikis) || @mentions.include?(tikis)
    end

    # Finds relevant and slightly relevant tokenized sentences to input
    # comparing non-stopword token overlaps
    # @param sentences [Array<Array<Integer>>]
    # @param input [String]
    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        tokenized.each do |token|
          if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
    # @param input [String]
    # @param limit [Integer] characters available for response
    # @param sentences [Array<Array<Integer>>]
    # @return [String]
    def make_response(input, limit=140, sentences=@mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        make_statement(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        make_statement(limit, generator)
      elsif sentences.equal?(@mentions)
        make_response(input, limit, @sentences)
      else
        make_statement(limit)
      end
    end
  end
end
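For context, a minimal usage sketch of the Model class above. This is not part of the gem itself; the corpus and model paths are hypothetical, and it assumes the gem (including its top-level `log` helper) is loaded via `require 'twitter_ebooks'`.

    # Sketch: build a model from a tweet-archive JSON corpus, save it, and reuse it.
    require 'twitter_ebooks'

    model = Ebooks::Model.consume("corpus/example.json")   # hypothetical corpus path
    model.save("model/example.model")                       # hypothetical model path

    model = Ebooks::Model.load("model/example.model")
    puts model.make_statement(140)                          # free-form tweet text
    puts model.make_response("how are the cats?", 140)      # reply biased toward mention data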
data/lib/twitter_ebooks/nlp.rb
@@ -0,0 +1,195 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'
require 'htmlentities'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    # Lazily loads an array of stopwords
    # Stopwords are common words that should often be ignored
    # @return [Array<String>]
    def self.stopwords
      @stopwords ||= File.exists?('stopwords.txt') ? File.read('stopwords.txt').split : []
    end

    # Lazily loads an array of known English nouns
    # @return [Array<String>]
    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    # Lazily loads an array of known English adjectives
    # @return [Array<String>]
    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # Lazily load part-of-speech tagging library
    # This can determine whether a word is being used as a noun/adjective/verb
    # @return [EngTagger]
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Lazily load HTML entity decoder
    # @return [HTMLEntities]
    def self.htmlentities
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # Normalize some strange unicode punctuation variants
    # @param text [String]
    # @return [String]
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    # @param text [String]
    # @return [Array<String>]
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    # @param sentence [String]
    # @return [Array<String>]
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
      sentence.split(regex)
    end

    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
    # @param word [String]
    # @return [String]
    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    # Use highscore gem to find interesting keywords in a corpus
    # @param text [String]
    # @return [Highscore::Keywords]
    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1 # => default: 0 = not considered
        #set :consonants, 5 # => default: 0 = not considered
        #set :ignore_case, true # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
        #set :stemming, true # => default: false
      end

      text.keywords
    end

    # Builds a proper sentence from a list of tikis
    # @param tikis [Array<Integer>]
    # @param tokens [Array<String>]
    # @return [String]
    def self.reconstruct(tikis, tokens)
      text = ""
      last_token = nil
      tikis.each do |tiki|
        next if tiki == INTERIM
        token = tokens[tiki]
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    # @param token1 [String]
    # @param token2 [String]
    # @return [Boolean]
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    # Is this token comprised of punctuation?
    # @param token [String]
    # @return [Boolean]
    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    # Is this token a stopword?
    # @param token [String]
    # @return [Boolean]
    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the generator; we can just tell it to retry
    # @param text [String]
    # @return [Boolean]
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end

    # Determine if a2 is a subsequence of a1
    # @param a1 [Array]
    # @param a2 [Array]
    # @return [Boolean]
    def self.subseq?(a1, a2)
      !a1.each_index.find do |i|
        a1[i...i+a2.length] == a2
      end.nil?
    end
  end
end
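To illustrate how tokenize and reconstruct round-trip a sentence, here is a small hand-built sketch. The tiki array is constructed manually rather than through Model#mass_tikify, and it assumes the gem (which defines the INTERIM sentinel referenced by reconstruct) is loaded.

    # Sketch: tokenize a sentence, index its tokens, and rebuild the text.
    require 'twitter_ebooks'

    tokens = Ebooks::NLP.tokenize("Look, a cat! Wow, okay.")
    vocab  = tokens.uniq
    tikis  = tokens.map { |t| vocab.index(t) }      # crude token -> index mapping

    Ebooks::NLP.reconstruct(tikis, vocab)
    # => "Look, a cat! Wow, okay."  (spaces re-inserted only where space_between? says so)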
data/lib/twitter_ebooks/suffix.rb
@@ -0,0 +1,104 @@
# encoding: utf-8

module Ebooks
  # This generator uses data similar to a Markov model, but
  # instead of making a chain by looking up bigrams it uses the
  # positions to randomly replace token array suffixes in one sentence
  # with matching suffixes in another
  class SuffixGenerator
    # Build a generator from a corpus of tikified sentences
    # "tikis" are token indexes-- a way of representing words
    # and punctuation as their integer position in a big array
    # of such tokens
    # @param sentences [Array<Array<Integer>>]
    # @return [SuffixGenerator]
    def self.build(sentences)
      SuffixGenerator.new(sentences)
    end

    def initialize(sentences)
      @sentences = sentences.reject { |s| s.empty? }
      @unigrams = {}
      @bigrams = {}

      @sentences.each_with_index do |tikis, i|
        if (i % 10000 == 0) then
          log("Building: sentence #{i} of #{sentences.length}")
        end
        last_tiki = INTERIM
        tikis.each_with_index do |tiki, j|
          @unigrams[last_tiki] ||= []
          @unigrams[last_tiki] << [i, j]

          @bigrams[last_tiki] ||= {}
          @bigrams[last_tiki][tiki] ||= []

          if j == tikis.length-1 # Mark sentence endings
            @unigrams[tiki] ||= []
            @unigrams[tiki] << [i, INTERIM]
            @bigrams[last_tiki][tiki] << [i, INTERIM]
          else
            @bigrams[last_tiki][tiki] << [i, j+1]
          end

          last_tiki = tiki
        end
      end

      self
    end

    # Generate a recombined sequence of tikis
    # @param passes [Integer] number of times to recombine
    # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
    # @return [Array<Integer>]
    def generate(passes=5, n=:unigrams)
      index = rand(@sentences.length)
      tikis = @sentences[index]
      used = [index] # Sentences we've already used
      verbatim = [tikis] # Verbatim sentences to avoid reproducing

      0.upto(passes-1) do
        varsites = {} # Map bigram start site => next tiki alternatives

        tikis.each_with_index do |tiki, i|
          next_tiki = tikis[i+1]
          break if next_tiki.nil?

          alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
          # Filter out suffixes from previous sentences
          alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
          varsites[i] = alternatives unless alternatives.empty?
        end

        variant = nil
        varsites.to_a.shuffle.each do |site|
          start = site[0]

          site[1].shuffle.each do |alt|
            verbatim << @sentences[alt[0]]
            suffix = @sentences[alt[0]][alt[1]..-1]
            potential = tikis[0..start+1] + suffix

            # Ensure we're not just rebuilding some segment of another sentence
            unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
              used << alt[0]
              variant = potential
              break
            end
          end

          break if variant
        end

        # If we failed to produce a variation from any alternative, there
        # is no use running additional passes-- they'll have the same result.
        break if variant.nil?

        tikis = variant
      end

      tikis
    end
  end
end
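A brief sketch of how this generator is driven, mirroring what Model#make_statement above does (the retry and length checks are omitted here; `model` is assumed to be an Ebooks::Model that has already consumed a corpus):

    # Sketch: build a generator from tikified sentences and recombine them.
    generator = Ebooks::SuffixGenerator.build(model.sentences)

    tikis = generator.generate(3, :bigrams)                  # conservative recombination
    tikis = generator.generate(3, :unigrams) if model.verbatim?(tikis)  # loosen if we copied a sentence

    puts Ebooks::NLP.reconstruct(tikis, model.tokens)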
data/lib/twitter_ebooks/sync.rb
@@ -0,0 +1,52 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter'
require 'json'
require 'mini_magick'
require 'open-uri'
require 'pry'

module Ebooks
  class Sync

    def self.run(botname, username)
      bot = Ebooks::Bot.get(botname)
      bot.configure
      source_user = username
      ebooks_user = bot.username
      user = bot.twitter.user(source_user)
      if user.profile_image_url then
        Ebooks::Sync::get(user.profile_image_url(:original), "image/#{source_user}_avatar")
        avatar = MiniMagick::Image.open("image/#{source_user}_avatar")
        avatar.flip
        avatar.write("image/#{ebooks_user}_avatar")
        avatar64 = Base64.encode64(File.read("image/#{ebooks_user}_avatar"))
        bot.twitter.update_profile_image(avatar64)
        p "Updated profile image for #{ebooks_user} from #{source_user}."
      else
        p "#{source_user} does not have a profile image to clone."
      end
      if user.profile_banner_url then
        Ebooks::Sync::get(user.profile_banner_url, "image/#{source_user}banner")
        banner = MiniMagick::Image.open("image/#{source_user}banner")
        banner.flip
        banner.write("image/#{ebooks_user}_banner")
        banner64 = Base64.encode64(File.read("image/#{ebooks_user}_banner"))
        bot.twitter.update_profile_banner(banner64)
        p "Updated cover image for #{ebooks_user} from #{source_user}."
      else
        p "#{source_user} does not have a cover image to clone."
      end
    end

    def self.get(url, destination)
      File.open(destination, "wb") do |saved_file|
        open(url, "rb") do |read_file|
          saved_file.write(read_file.read)
        end
      end
    end

  end
end
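For completeness, a hypothetical invocation of the sync helper above. The bot name and source account are placeholders; it assumes a bot of that name is registered in the project's bots.rb with valid credentials, and that an image/ directory exists for the downloaded files.

    # Sketch: flip and copy a source account's avatar and banner onto the bot.
    require 'twitter_ebooks'

    Ebooks::Sync.run("my_ebooks_bot", "some_user")   # placeholder bot and source names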