twitter_ebooks 2.3.2 → 3.0.0
This diff shows the content of publicly released package versions as published to their respective registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +7 -0
- data/README.md +60 -30
- data/bin/ebooks +239 -117
- data/lib/twitter_ebooks.rb +2 -2
- data/lib/twitter_ebooks/archive.rb +12 -9
- data/lib/twitter_ebooks/bot.rb +343 -109
- data/lib/twitter_ebooks/model.rb +104 -22
- data/lib/twitter_ebooks/nlp.rb +46 -13
- data/lib/twitter_ebooks/suffix.rb +9 -1
- data/lib/twitter_ebooks/version.rb +1 -1
- data/skeleton/Gemfile +1 -1
- data/skeleton/Procfile +1 -1
- data/skeleton/bots.rb +35 -22
- data/spec/bot_spec.rb +178 -0
- data/spec/model_spec.rb +18 -2
- data/twitter_ebooks.gemspec +7 -3
- metadata +72 -20
- data/lib/twitter_ebooks/markov.rb +0 -82
- data/skeleton/run.rb +0 -9
- data/test/corpus/0xabad1dea.tweets +0 -14696
- data/test/keywords.rb +0 -18
- data/test/tokenize.rb +0 -18
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -8,12 +8,41 @@ require 'csv'
 
 module Ebooks
   class Model
-
+    # @return [Array<String>]
+    # An array of unique tokens. This is the main source of actual strings
+    # in the model. Manipulation of a token is done using its index
+    # in this array, which we call a "tiki"
+    attr_accessor :tokens
+
+    # @return [Array<Array<Integer>>]
+    # Sentences represented by arrays of tikis
+    attr_accessor :sentences
+
+    # @return [Array<Array<Integer>>]
+    # Sentences derived from Twitter mentions
+    attr_accessor :mentions
+
+    # @return [Array<String>]
+    # The top 200 most important keywords, in descending order
+    attr_accessor :keywords
+
+    # Generate a new model from a corpus file
+    # @param path [String]
+    # @return [Ebooks::Model]
+    def self.consume(path)
+      Model.new.consume(path)
+    end
 
-
-
+    # Generate a new model from multiple corpus files
+    # @param paths [Array<String>]
+    # @return [Ebooks::Model]
+    def self.consume_all(paths)
+      Model.new.consume_all(paths)
     end
 
+    # Load a saved model
+    # @param path [String]
+    # @return [Ebooks::Model]
     def self.load(path)
       model = Model.new
       model.instance_eval do
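The new class-level API above wraps corpus ingestion, multi-corpus merging, and persistence. A minimal usage sketch (the file paths here are hypothetical):

    require 'twitter_ebooks'

    # Build one model from several corpora, then save it for reuse
    model = Ebooks::Model.consume_all(['corpus/tweets.json', 'corpus/archive.csv'])
    model.save('model/combined.model')

    # Reload it later without reparsing the corpora
    model = Ebooks::Model.load('model/combined.model')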
@@ -26,6 +55,8 @@ module Ebooks
       model
     end
 
+    # Save model to a file
+    # @param path [String]
     def save(path)
       File.open(path, 'wb') do |f|
         f.write(Marshal.dump({
@@ -39,19 +70,22 @@ module Ebooks
     end
 
     def initialize
-      # This is the only source of actual strings in the model. It is
-      # an array of unique tokens. Manipulation of a token is mostly done
-      # using its index in this array, which we call a "tiki"
       @tokens = []
 
       # Reverse lookup tiki by token, for faster generation
       @tikis = {}
     end
 
+    # Reverse lookup a token index from a token
+    # @param token [String]
+    # @return [Integer]
     def tikify(token)
       @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
     end
 
+    # Convert a body of text into arrays of tikis
+    # @param text [String]
+    # @return [Array<Array<Integer>>]
     def mass_tikify(text)
       sentences = NLP.sentences(text)
 
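tikify and mass_tikify implement the model's core compression: each unique token string is stored once in @tokens, and sentences become arrays of integer indices ("tikis"). The same lookup logic as a standalone sketch:

    tokens = []
    tikis  = {}

    tikify = lambda do |token|
      tikis[token] or (tokens << token and tikis[token] = tokens.length - 1)
    end

    tikify.call('hello')  # => 0 (new token appended)
    tikify.call('world')  # => 1
    tikify.call('hello')  # => 0 (reverse lookup hit; nothing appended)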
@@ -65,9 +99,10 @@ module Ebooks
       end
     end
 
+    # Consume a corpus into this model
+    # @param path [String]
     def consume(path)
       content = File.read(path, :encoding => 'utf-8')
-      @hash = Digest::MD5.hexdigest(content)
 
       if path.split('.')[-1] == "json"
         log "Reading json corpus from #{path}"
@@ -87,6 +122,12 @@ module Ebooks
         lines = content.split("\n")
       end
 
+      consume_lines(lines)
+    end
+
+    # Consume a sequence of lines
+    # @param lines [Array<String>]
+    def consume_lines(lines)
       log "Removing commented lines and sorting mentions"
 
       statements = []
@@ -113,30 +154,62 @@ module Ebooks
       @mentions = mass_tikify(mention_text)
 
       log "Ranking keywords"
-      @keywords = NLP.keywords(text)
+      @keywords = NLP.keywords(text).top(200).map(&:to_s)
 
       self
     end
 
-
-
-
-
-
-
-
-
-
-
+    # Consume multiple corpuses into this model
+    # @param paths [Array<String>]
+    def consume_all(paths)
+      lines = []
+      paths.each do |path|
+        content = File.read(path, :encoding => 'utf-8')
+
+        if path.split('.')[-1] == "json"
+          log "Reading json corpus from #{path}"
+          l = JSON.parse(content).map do |tweet|
+            tweet['text']
+          end
+          lines.concat(l)
+        elsif path.split('.')[-1] == "csv"
+          log "Reading CSV corpus from #{path}"
+          content = CSV.parse(content)
+          header = content.shift
+          text_col = header.index('text')
+          l = content.map do |tweet|
+            tweet[text_col]
+          end
+          lines.concat(l)
+        else
+          log "Reading plaintext corpus from #{path}"
+          l = content.split("\n")
+          lines.concat(l)
+        end
+      end
+      consume_lines(lines)
+    end
 
-
+    # Correct encoding issues in generated text
+    # @param text [String]
+    # @return [String]
+    def fix(text)
+      NLP.htmlentities.decode text
     end
 
+    # Check if an array of tikis comprises a valid tweet
+    # @param tikis [Array<Integer>]
+    # @param limit Integer how many chars we have left
     def valid_tweet?(tikis, limit)
       tweet = NLP.reconstruct(tikis, @tokens)
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
+    # Generate some text
+    # @param limit [Integer] available characters
+    # @param generator [SuffixGenerator, nil]
+    # @param retry_limit [Integer] how many times to retry on duplicates
+    # @return [String]
     def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
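Keyword extraction now stores plain strings rather than Highscore keyword objects: NLP.keywords returns a Highscore::Keywords collection, and .top(200).map(&:to_s) keeps only the 200 highest-ranked terms. Roughly (corpus text hypothetical):

    text = File.read('corpus/tweets.txt', :encoding => 'utf-8')
    Ebooks::NLP.keywords(text).top(5).map(&:to_s)  # => the five highest-ranked keyword strings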
@@ -171,12 +244,17 @@ module Ebooks
     end
 
     # Test if a sentence has been copied verbatim from original
-
-
+    # @param tikis [Array<Integer>]
+    # @return [Boolean]
+    def verbatim?(tikis)
+      @sentences.include?(tikis) || @mentions.include?(tikis)
     end
 
-    # Finds
+    # Finds relevant and slightly relevant tokenized sentences to input
     # comparing non-stopword token overlaps
+    # @param sentences [Array<Array<Integer>>]
+    # @param input [String]
+    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
     def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
@@ -197,6 +275,10 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
+    # @param input [String]
+    # @param limit [Integer] characters available for response
+    # @param sentences [Array<Array<Integer>>]
+    # @return [String]
     def make_response(input, limit=140, sentences=@mentions)
       # Prefer mentions
       relevant, slightly_relevant = find_relevant(sentences, input)
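make_response narrows the corpus to sentences sharing non-stopword tokens with the input and builds a smaller generator from that subset, so replies stay on topic. A usage sketch (model path hypothetical):

    model = Ebooks::Model.load('model/example.model')
    # a limit below 140 leaves room for the "@username " reply prefix
    model.make_response('what do you think of cats?', 126)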
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -12,31 +12,35 @@ module Ebooks
     # Some of this stuff is pretty heavy and we don't necessarily need
     # to be using it all of the time
 
+    # Lazily loads an array of stopwords
+    # Stopwords are common English words that should often be ignored
+    # @return [Array<String>]
     def self.stopwords
       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
     end
 
+    # Lazily loads an array of known English nouns
+    # @return [Array<String>]
     def self.nouns
       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
     end
 
+    # Lazily loads an array of known English adjectives
+    # @return [Array<String>]
     def self.adjectives
       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
     end
 
-    #
+    # Lazily load part-of-speech tagging library
+    # This can determine whether a word is being used as a noun/adjective/verb
+    # @return [EngTagger]
     def self.tagger
       require 'engtagger'
       @tagger ||= EngTagger.new
     end
 
-    #
-
-      require 'gingerice'
-      Gingerice::Parser.new # No caching for this one
-    end
-
-    # For decoding html entities
+    # Lazily load HTML entity decoder
+    # @return [HTMLEntities]
     def self.htmlentities
       require 'htmlentities'
       @htmlentities ||= HTMLEntities.new
@@ -44,7 +48,9 @@ module Ebooks
 
     ### Utility functions
 
-    #
+    # Normalize some strange unicode punctuation variants
+    # @param text [String]
+    # @return [String]
     def self.normalize(text)
       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
     end
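A worked example of normalize, following the gsub chain and entity decoding above:

    Ebooks::NLP.normalize('“Hello…” &amp; goodbye')
    # => "\"Hello...\" & goodbye"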
@@ -53,6 +59,8 @@ module Ebooks
     # We use ad hoc approach because fancy libraries do not deal
     # especially well with tweet formatting, and we can fake solving
     # the quote problem during generation
+    # @param text [String]
+    # @return [Array<String>]
     def self.sentences(text)
       text.split(/\n+|(?<=[.?!])\s+/)
     end
@@ -60,15 +68,23 @@ module Ebooks
     # Split a sentence into word-level tokens
     # As above, this is ad hoc because tokenization libraries
     # do not behave well wrt. things like emoticons and timestamps
+    # @param sentence [String]
+    # @return [Array<String>]
     def self.tokenize(sentence)
       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
       sentence.split(regex)
     end
 
+    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
+    # @param word [String]
+    # @return [String]
     def self.stem(word)
       Stemmer::stem_word(word.downcase)
     end
 
+    # Use highscore gem to find interesting keywords in a corpus
+    # @param text [String]
+    # @return [Highscore::Keywords]
     def self.keywords(text)
       # Preprocess to remove stopwords (highscore's blacklist is v. slow)
       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
@@ -90,7 +106,10 @@ module Ebooks
       text.keywords
     end
 
-    #
+    # Builds a proper sentence from a list of tikis
+    # @param tikis [Array<Integer>]
+    # @param tokens [Array<String>]
+    # @return [String]
     def self.reconstruct(tikis, tokens)
       text = ""
       last_token = nil
@@ -105,6 +124,9 @@ module Ebooks
     end
 
     # Determine if we need to insert a space between two tokens
+    # @param token1 [String]
+    # @param token2 [String]
+    # @return [Boolean]
     def self.space_between?(token1, token2)
       p1 = self.punctuation?(token1)
       p2 = self.punctuation?(token2)
@@ -119,10 +141,16 @@ module Ebooks
       end
     end
 
+    # Is this token comprised of punctuation?
+    # @param token [String]
+    # @return [Boolean]
     def self.punctuation?(token)
       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
     end
 
+    # Is this token a stopword?
+    # @param token [String]
+    # @return [Boolean]
     def self.stopword?(token)
       @stopword_set ||= stopwords.map(&:downcase).to_set
       @stopword_set.include?(token.downcase)
@@ -130,7 +158,9 @@ module Ebooks
 
     # Determine if a sample of text contains unmatched brackets or quotes
     # This is one of the more frequent and noticeable failure modes for
-    # the
+    # the generator; we can just tell it to retry
+    # @param text [String]
+    # @return [Boolean]
     def self.unmatched_enclosers?(text)
       enclosers = ['**', '""', '()', '[]', '``', "''"]
       enclosers.each do |pair|
@@ -153,10 +183,13 @@ module Ebooks
     end
 
     # Determine if a2 is a subsequence of a1
+    # @param a1 [Array]
+    # @param a2 [Array]
+    # @return [Boolean]
     def self.subseq?(a1, a2)
-      a1.each_index.find do |i|
+      !a1.each_index.find do |i|
        a1[i...i+a2.length] == a2
-      end
+      end.nil?
     end
   end
 end
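The subseq? change fixes its return value: Enumerable#find yields the matching start index (or nil), so the old version returned an Integer rather than a boolean; wrapping it in !(...).nil? makes it a true predicate. Per the new implementation:

    Ebooks::NLP.subseq?([5, 6, 7, 8], [6, 7])  # => true  (window found at index 1)
    Ebooks::NLP.subseq?([5, 6, 7, 8], [7, 6])  # => false (no matching window)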
data/lib/twitter_ebooks/suffix.rb
CHANGED
@@ -1,11 +1,14 @@
 # encoding: utf-8
 
 module Ebooks
-  # This generator uses data identical to
+  # This generator uses data identical to a markov model, but
   # instead of making a chain by looking up bigrams it uses the
   # positions to randomly replace suffixes in one sentence with
   # matching suffixes in another
   class SuffixGenerator
+    # Build a generator from a corpus of tikified sentences
+    # @param sentences [Array<Array<Integer>>]
+    # @return [SuffixGenerator]
     def self.build(sentences)
       SuffixGenerator.new(sentences)
     end
@@ -39,6 +42,11 @@ module Ebooks
       self
     end
 
+
+    # Generate a recombined sequence of tikis
+    # @param passes [Integer] number of times to recombine
+    # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
+    # @return [Array<Integer>]
     def generate(passes=5, n=:unigrams)
       index = rand(@sentences.length)
       tikis = @sentences[index]
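In context, Model#make_statement drives this generator. A minimal sketch of using it directly, assuming a trained model as above:

    generator = Ebooks::SuffixGenerator.build(model.sentences)
    tikis = generator.generate(5, :bigrams)  # :bigrams recombines more conservatively
    Ebooks::NLP.reconstruct(tikis, model.tokens)  # back to a string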
data/skeleton/Gemfile
CHANGED
data/skeleton/Procfile
CHANGED
@@ -1 +1 @@
-worker:
+worker: bundle exec ebooks start
data/skeleton/bots.rb
CHANGED
@@ -1,42 +1,55 @@
-#!/usr/bin/env ruby
-
 require 'twitter_ebooks'
 
 # This is an example bot definition with event handlers commented out
-# You can define as many
+# You can define and instantiate as many bots as you like
+
+class MyBot < Ebooks::Bot
+  # Configuration here applies to all MyBots
+  def configure
+    # Consumer details come from registering an app at https://dev.twitter.com/
+    # Once you have consumer details, use "ebooks auth" for new access tokens
+    self.consumer_key = '' # Your app consumer key
+    self.consumer_secret = '' # Your app consumer secret
+
+    # Users to block instead of interacting with
+    self.blacklist = ['tnietzschequote']
+
+    # Range in seconds to randomize delay when bot.delay is called
+    self.delay_range = 1..6
+  end
 
-
-
-
-
-
-
-
+  def on_startup
+    scheduler.every '24h' do
+      # Tweet something every 24 hours
+      # See https://github.com/jmettraux/rufus-scheduler
+      # bot.tweet("hi")
+      # bot.pictweet("hi", "cuteselfie.jpg")
+    end
+  end
 
-
+  def on_message(dm)
     # Reply to a DM
     # bot.reply(dm, "secret secrets")
   end
 
-
+  def on_follow(user)
     # Follow a user back
     # bot.follow(user[:screen_name])
   end
 
-
+  def on_mention(tweet)
     # Reply to a mention
-    # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
+    # bot.reply(tweet, meta(tweet)[:reply_prefix] + "oh hullo")
   end
 
-
+  def on_timeline(tweet)
     # Reply to a tweet in the bot's timeline
-    # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
+    # bot.reply(tweet, meta(tweet)[:reply_prefix] + "nice tweet")
   end
+end
 
-
-
-
-
-    # bot.pictweet("hi", "cuteselfie.jpg", ":possibly_sensitive => true")
-end
+# Make a MyBot and attach it to an account
+MyBot.new("{{BOT_NAME}}") do |bot|
+  bot.access_token = "" # Token connecting the app to this account
+  bot.access_token_secret = "" # Secret connecting the app to this account
 end