bot_twitter_ebooks 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,336 @@
+ #!/usr/bin/env ruby
+ # encoding: utf-8
+
+ require 'json'
+ require 'set'
+ require 'digest/md5'
+ require 'csv'
+
+ module Ebooks
+   class Model
+     # @return [Array<String>]
+     #   An array of unique tokens. This is the main source of actual strings
+     #   in the model. Manipulation of a token is done using its index
+     #   in this array, which we call a "tiki"
+     attr_accessor :tokens
+
+     # @return [Array<Array<Integer>>]
+     #   Sentences represented by arrays of tikis
+     attr_accessor :sentences
+
+     # @return [Array<Array<Integer>>]
+     #   Sentences derived from Twitter mentions
+     attr_accessor :mentions
+
+     # @return [Array<String>]
+     #   The top 200 most important keywords, in descending order
+     attr_accessor :keywords
+
+     # Generate a new model from a corpus file
+     # @param path [String]
+     # @return [Ebooks::Model]
+     def self.consume(path)
+       Model.new.consume(path)
+     end
+
+     # Generate a new model from multiple corpus files
+     # @param paths [Array<String>]
+     # @return [Ebooks::Model]
+     def self.consume_all(paths)
+       Model.new.consume_all(paths)
+     end
+
+     # Load a saved model
+     # @param path [String]
+     # @return [Ebooks::Model]
+     def self.load(path)
+       model = Model.new
+       model.instance_eval do
+         props = Marshal.load(File.open(path, 'rb') { |f| f.read })
+         @tokens = props[:tokens]
+         @sentences = props[:sentences]
+         @mentions = props[:mentions]
+         @keywords = props[:keywords]
+       end
+       model
+     end
+
+     # Save model to a file
+     # @param path [String]
+     def save(path)
+       File.open(path, 'wb') do |f|
+         f.write(Marshal.dump({
+           tokens: @tokens,
+           sentences: @sentences,
+           mentions: @mentions,
+           keywords: @keywords
+         }))
+       end
+       self
+     end
+
+     # Append a generated model to an existing model file instead of overwriting it
+     # @param path [String]
+     def append(path)
+       existing = File.file?(path)
+       if !existing
+         log "No existing model found at #{path}"
+         return
+       else
+         # Read in and deserialize the existing model
+         props = Marshal.load(File.open(path, 'rb') { |old| old.read })
+         old_tokens = props[:tokens]
+         old_sentences = props[:sentences]
+         old_mentions = props[:mentions]
+         old_keywords = props[:keywords]
+
+         # Append the existing properties to the new ones and overwrite with the merged model
+         File.open(path, 'wb') do |f|
+           f.write(Marshal.dump({
+             tokens: @tokens.concat(old_tokens),
+             sentences: @sentences.concat(old_sentences),
+             mentions: @mentions.concat(old_mentions),
+             keywords: @keywords.concat(old_keywords)
+           }))
+         end
+       end
+       self
+     end
+
+     def initialize
+       @tokens = []
+
+       # Reverse lookup tiki by token, for faster generation
+       @tikis = {}
+     end
+
+     # Reverse lookup a token index (tiki) from a token
+     # @param token [String]
+     # @return [Integer]
+     def tikify(token)
+       if @tikis.has_key?(token)
+         @tikis[token]
+       else
+         # Log progress every 1000 newly seen tokens
+         puts "#{@tokens.length + 1} tokens" if (@tokens.length + 1) % 1000 == 0
+         @tokens << token
+         @tikis[token] = @tokens.length - 1
+       end
+     end
+
+     # Convert a body of text into arrays of tikis
+     # @param text [String]
+     # @return [Array<Array<Integer>>]
+     def mass_tikify(text)
+       sentences = NLP.sentences(text)
+
+       sentences.map do |s|
+         tokens = NLP.tokenize(s).reject do |t|
+           # Don't include usernames/urls as tokens
+           t.include?('@') || t.include?('http')
+         end
+
+         tokens.map { |t| tikify(t) }
+       end
+     end
+
+     # Consume a corpus into this model
+     # @param path [String]
+     def consume(path)
+       content = File.read(path, :encoding => 'utf-8')
+
+       if path.split('.')[-1] == "json"
+         log "Reading json corpus from #{path}"
+         lines = JSON.parse(content).map do |tweet|
+           tweet['text']
+         end
+       elsif path.split('.')[-1] == "csv"
+         log "Reading CSV corpus from #{path}"
+         content = CSV.parse(content)
+         header = content.shift
+         text_col = header.index('text')
+         lines = content.map do |tweet|
+           tweet[text_col]
+         end
+       else
+         log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename it with the appropriate extension and reconsume)"
+         lines = content.split("\n")
+       end
+
+       consume_lines(lines)
+     end
+
+     # Consume a sequence of lines
+     # @param lines [Array<String>]
+     def consume_lines(lines)
+       log "Removing commented lines and sorting mentions"
+
+       statements = []
+       mentions = []
+       lines.each do |l|
+         next if l.start_with?('#') # Remove commented lines
+         next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+         if l.include?('@')
+           mentions << NLP.normalize(l)
+         else
+           statements << NLP.normalize(l)
+         end
+       end
+
+       text = statements.join("\n").encode('UTF-8', :invalid => :replace)
+       mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)
+
+       lines = nil; statements = nil; mentions = nil # Allow garbage collection
+
+       log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"
+
+       @sentences = mass_tikify(text)
+       @mentions = mass_tikify(mention_text)
+
+       log "Ranking keywords"
+       @keywords = NLP.keywords(text).top(200).map(&:to_s)
+       log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"
+
+       self
+     end
+
+     # Consume multiple corpora into this model
+     # @param paths [Array<String>]
+     def consume_all(paths)
+       lines = []
+       paths.each do |path|
+         content = File.read(path, :encoding => 'utf-8')
+
+         if path.split('.')[-1] == "json"
+           log "Reading json corpus from #{path}"
+           l = JSON.parse(content).map do |tweet|
+             tweet['text']
+           end
+           lines.concat(l)
+         elsif path.split('.')[-1] == "csv"
+           log "Reading CSV corpus from #{path}"
+           content = CSV.parse(content)
+           header = content.shift
+           text_col = header.index('text')
+           l = content.map do |tweet|
+             tweet[text_col]
+           end
+           lines.concat(l)
+         else
+           log "Reading plaintext corpus from #{path}"
+           l = content.split("\n")
+           lines.concat(l)
+         end
+       end
+       consume_lines(lines)
+     end
+
+     # Decode HTML entities to correct encoding artifacts in generated text
+     # @param text [String]
+     # @return [String]
+     def fix(text)
+       NLP.htmlentities.decode text
+     end
+
+     # Check if an array of tikis comprises a valid tweet
+     # @param tikis [Array<Integer>]
+     # @param limit [Integer] how many characters we have left
+     # @return [Boolean]
+     def valid_tweet?(tikis, limit)
+       tweet = NLP.reconstruct(tikis, @tokens)
+       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
+     end
+
+     # Generate some text
+     # @param limit [Integer] available characters
+     # @param generator [SuffixGenerator, nil]
+     # @param retry_limit [Integer] how many times to retry on an invalid tweet
+     # @return [String]
+     def make_statement(limit=140, generator=nil, retry_limit=10)
+       responding = !generator.nil?
+       generator ||= SuffixGenerator.build(@sentences)
+
+       retries = 0
+       tweet = ""
+
+       while (tikis = generator.generate(3, :bigrams)) do
+         #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
+         break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)
+
+         retries += 1
+         break if retries >= retry_limit
+       end
+
+       if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
+         # Try again with the unigram generator to avoid copying a sentence verbatim
+         #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
+         while (tikis = generator.generate(3, :unigrams)) do
+           break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
+
+           retries += 1
+           break if retries >= retry_limit
+         end
+       end
+
+       tweet = NLP.reconstruct(tikis, @tokens)
+
+       if retries >= retry_limit
+         log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+       end
+
+       fix tweet
+     end
+
+     # Test if a sentence has been copied verbatim from the original corpus
+     # @param tikis [Array<Integer>]
+     # @return [Boolean]
+     def verbatim?(tikis)
+       @sentences.include?(tikis) || @mentions.include?(tikis)
+     end
+
+     # Finds relevant and slightly relevant tokenized sentences for an input
+     # by comparing non-stopword token overlaps
+     # @param sentences [Array<Array<Integer>>]
+     # @param input [String]
+     # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
+     def find_relevant(sentences, input)
+       relevant = []
+       slightly_relevant = []
+
+       tokenized = NLP.tokenize(input).map(&:downcase)
+
+       sentences.each do |sent|
+         tokenized.each do |token|
+           if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
+             relevant << sent unless NLP.stopword?(token)
+             slightly_relevant << sent
+           end
+         end
+       end
+
+       [relevant, slightly_relevant]
+     end
+
+     # Generates a response by looking for related sentences
+     # in the corpus and building a smaller generator from these
+     # @param input [String]
+     # @param limit [Integer] characters available for response
+     # @param sentences [Array<Array<Integer>>]
+     # @return [String]
+     def make_response(input, limit=140, sentences=@mentions)
+       # Prefer mentions
+       relevant, slightly_relevant = find_relevant(sentences, input)
+
+       if relevant.length >= 3
+         generator = SuffixGenerator.build(relevant)
+         make_statement(limit, generator)
+       elsif slightly_relevant.length >= 5
+         generator = SuffixGenerator.build(slightly_relevant)
+         make_statement(limit, generator)
+       elsif sentences.equal?(@mentions)
+         # No matches among mentions; fall back to the full sentence corpus
+         make_response(input, limit, @sentences)
+       else
+         make_statement(limit)
+       end
+     end
+   end
+ end
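
A minimal usage sketch for Ebooks::Model, assuming the rest of the gem (notably SuffixGenerator and the log helper used above) is loaded; the file names are illustrative:

    # Build and persist a model from a tweet archive, then generate text
    model = Ebooks::Model.consume('corpus.json')
    model.save('model.bin')

    model = Ebooks::Model.load('model.bin')
    puts model.make_statement(140)                       # free-form statement
    puts model.make_response('tell me about cats', 140)  # reply seeded by related sentences
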
@@ -0,0 +1,195 @@
+ # encoding: utf-8
+ require 'fast-stemmer'
+ require 'highscore'
+ require 'htmlentities'
+
+ module Ebooks
+   module NLP
+     # We deliberately limit our punctuation handling to stuff we can do consistently
+     # It'll just be a part of another token if we don't split it out, and that's fine
+     PUNCTUATION = ".?!,"
+
+     # Lazy-load NLP libraries and resources
+     # Some of this stuff is pretty heavy and we don't necessarily need
+     # to be using it all of the time
+
+     # Lazily loads an array of stopwords
+     # Stopwords are common words that should often be ignored
+     # @return [Array<String>]
+     def self.stopwords
+       @stopwords ||= File.exist?('stopwords.txt') ? File.read('stopwords.txt').split : []
+     end
+
+     # Lazily loads an array of known English nouns
+     # @return [Array<String>]
+     def self.nouns
+       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
+     end
+
+     # Lazily loads an array of known English adjectives
+     # @return [Array<String>]
+     def self.adjectives
+       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
+     end
+
+     # Lazily load part-of-speech tagging library
+     # This can determine whether a word is being used as a noun/adjective/verb
+     # @return [EngTagger]
+     def self.tagger
+       require 'engtagger'
+       @tagger ||= EngTagger.new
+     end
+
+     # Lazily load HTML entity decoder
+     # @return [HTMLEntities]
+     def self.htmlentities
+       @htmlentities ||= HTMLEntities.new
+     end
+
+     ### Utility functions
+
+     # Normalize some strange unicode punctuation variants
+     # @param text [String]
+     # @return [String]
+     def self.normalize(text)
+       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
+     end
+
+     # Split text into sentences
+     # We use an ad hoc approach because fancy libraries do not deal
+     # especially well with tweet formatting, and we can fake solving
+     # the quote problem during generation
+     # @param text [String]
+     # @return [Array<String>]
+     def self.sentences(text)
+       text.split(/\n+|(?<=[.?!])\s+/)
+     end
+
+     # Split a sentence into word-level tokens
+     # As above, this is ad hoc because tokenization libraries
+     # do not behave well with respect to things like emoticons and timestamps
+     # @param sentence [String]
+     # @return [Array<String>]
+     def self.tokenize(sentence)
+       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
+       sentence.split(regex)
+     end
+
+     # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
+     # @param word [String]
+     # @return [String]
+     def self.stem(word)
+       Stemmer::stem_word(word.downcase)
+     end
+
+     # Use the highscore gem to find interesting keywords in a corpus
+     # @param text [String]
+     # @return [Highscore::Keywords]
+     def self.keywords(text)
+       # Preprocess to remove stopwords (highscore's blacklist is very slow)
+       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
+
+       text = Highscore::Content.new(text)
+
+       text.configure do
+         #set :multiplier, 2
+         #set :upper_case, 3
+         #set :long_words, 2
+         #set :long_words_threshold, 15
+         #set :vowels, 1 # => default: 0 = not considered
+         #set :consonants, 5 # => default: 0 = not considered
+         #set :ignore_case, true # => default: false
+         set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
+         #set :stemming, true # => default: false
+       end
+
+       text.keywords
+     end
+
+     # Builds a proper sentence from a list of tikis
+     # @param tikis [Array<Integer>]
+     # @param tokens [Array<String>]
+     # @return [String]
+     def self.reconstruct(tikis, tokens)
+       text = ""
+       last_token = nil
+       tikis.each do |tiki|
+         next if tiki == INTERIM # Skip the INTERIM sentinel (defined elsewhere in the gem)
+         token = tokens[tiki]
+         text += ' ' if last_token && space_between?(last_token, token)
+         text += token
+         last_token = token
+       end
+       text
+     end
+
+     # Determine if we need to insert a space between two tokens
+     # @param token1 [String]
+     # @param token2 [String]
+     # @return [Boolean]
+     def self.space_between?(token1, token2)
+       p1 = self.punctuation?(token1)
+       p2 = self.punctuation?(token2)
+       if p1 && p2 # "foo?!"
+         false
+       elsif !p1 && p2 # "foo."
+         false
+       elsif p1 && !p2 # "foo. rah"
+         true
+       else # "foo rah"
+         true
+       end
+     end
+
+     # Is this token composed entirely of punctuation?
+     # @param token [String]
+     # @return [Boolean]
+     def self.punctuation?(token)
+       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
+     end
+
+     # Is this token a stopword?
+     # @param token [String]
+     # @return [Boolean]
+     def self.stopword?(token)
+       @stopword_set ||= stopwords.map(&:downcase).to_set
+       @stopword_set.include?(token.downcase)
+     end
+
+     # Determine if a sample of text contains unmatched brackets or quotes
+     # This is one of the more frequent and noticeable failure modes for
+     # the generator; we can just tell it to retry
+     # @param text [String]
+     # @return [Boolean]
+     def self.unmatched_enclosers?(text)
+       enclosers = ['**', '""', '()', '[]', '``', "''"]
+       enclosers.each do |pair|
+         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
+         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
+
+         opened = 0
+
+         tokenize(text).each do |token|
+           opened += 1 if token.match(starter)
+           opened -= 1 if token.match(ender)
+
+           return true if opened < 0 # Too many ends!
+         end
+
+         return true if opened != 0 # Mismatch somewhere.
+       end
+
+       false
+     end
+
+     # Determine if a2 is a subsequence of a1
+     # @param a1 [Array]
+     # @param a2 [Array]
+     # @return [Boolean]
+     def self.subseq?(a1, a2)
+       !a1.each_index.find do |i|
+         a1[i...i+a2.length] == a2
+       end.nil?
+     end
+   end
+ end
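
A minimal sketch of how the NLP helpers compose, with made-up example strings:

    text   = Ebooks::NLP.normalize('“Cats are great” … right?')  # curly quotes and ellipsis become ASCII
    tokens = Ebooks::NLP.sentences(text).flat_map { |s| Ebooks::NLP.tokenize(s) }
    Ebooks::NLP.space_between?('cats', '.')            # => false (no space before punctuation)
    Ebooks::NLP.unmatched_enclosers?('he said "hi')    # => true (unbalanced quote)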