twitter_ebooks 2.0.0

This diff shows the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,145 @@
+ #!/usr/bin/env ruby
+ require 'twitter'
+ require 'tweetstream'
+ require 'rufus/scheduler'
+
+ module Ebooks
+   class Bot
+     attr_accessor :consumer_key, :consumer_secret,
+                   :oauth_token, :oauth_token_secret
+
+     attr_accessor :username
+
+     attr_reader :twitter, :stream
+
+     @@all = [] # List of all defined bots
+     def self.all; @@all; end
+
+     def initialize(username, &b)
+       # Set defaults
+       @username = username
+
+       # Override with callback
+       b.call(self)
+
+       Bot.all.push(self)
+     end
+
+     def log(*args)
+       STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
+       STDERR.flush
+     end
+
+     # Connects to tweetstream and opens event handlers for this bot
+     def start
+       TweetStream.configure do |config|
+         config.consumer_key = @consumer_key
+         config.consumer_secret = @consumer_secret
+         config.oauth_token = @oauth_token
+         config.oauth_token_secret = @oauth_token_secret
+       end
+
+       Twitter.configure do |config|
+         config.consumer_key = @consumer_key
+         config.consumer_secret = @consumer_secret
+         config.oauth_token = @oauth_token
+         config.oauth_token_secret = @oauth_token_secret
+       end
+
+       @twitter = Twitter::Client.new
+       @stream = TweetStream::Client.new
+
+       @stream.on_error do |msg|
+         log "ERROR: #{msg}"
+       end
+
+       @stream.on_inited do
+         log "Online!"
+       end
+
+       @stream.on_event(:follow) do |event|
+         log "Followed by #{event[:source][:screen_name]}"
+         @on_follow.call(event[:source])
+       end
+
+       @stream.on_direct_message do |dm|
+         next if dm[:sender][:screen_name] == @username # Don't reply to self
+         log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
+         @on_message.call(dm)
+       end
+
+       @stream.userstream do |ev|
+         next unless ev[:text] # If it's not a text-containing tweet, ignore it
+         next if ev[:user][:screen_name] == @username # Ignore our own tweets
+
+         meta = {}
+         mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }
+
+         reply_mentions = mentions.reject { |m| m.downcase == @username }
+         reply_mentions << ev[:user][:screen_name]
+
+         meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
+
+         mless = ev[:text]
+         ev.attrs[:entities][:user_mentions].reverse.each do |entity|
+           mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]+1..-1]
+         end
+         meta[:mentionless] = mless
+
+         # To check if this is a mention, ensure:
+         # - The tweet mentions list contains our username
+         # - The tweet is not being retweeted by somebody else
+         # - Or soft-retweeted by somebody else
+         if mentions.map(&:downcase).include?(@username) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
+           log "Mention from #{ev[:user][:screen_name]}: #{ev[:text]}"
+           @on_mention.call(ev, meta)
+         else
+           @on_timeline.call(ev, meta)
+         end
+       end
+     end
+
+     # Wrapper for EM.add_timer
+     # Delays add a greater sense of humanity to bot behaviour
+     def delay(time, &b)
+       time = time.to_a.sample unless time.is_a? Integer
+       EM.add_timer(time, &b)
+     end
+
+     # Reply to a tweet or a DM.
+     # Applies configurable @reply_delay range
+     def reply(ev, text, opts={})
+       opts = opts.clone
+       delay = @reply_delay.to_a.sample
+
+       if ev.is_a? Twitter::DirectMessage
+         log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
+         @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
+       elsif ev.is_a? Twitter::Tweet
+         log "Replying to @#{ev[:user][:screen_name]}: #{text}"
+         @twitter.update(text, in_reply_to_status_id: ev[:id])
+       else
+         raise Exception("Don't know how to reply to a #{ev.class}")
+       end
+     end
+
+     def scheduler
+       @scheduler ||= Rufus::Scheduler.new
+     end
+
+     def follow(*args)
+       log "Following #{args}"
+       @twitter.follow(*args)
+     end
+
+     def tweet(*args)
+       log "Tweeting #{args.inspect}"
+       @twitter.update(*args)
+     end
+
+     def on_follow(&b); @on_follow = b; end
+     def on_mention(&b); @on_mention = b; end
+     def on_timeline(&b); @on_timeline = b; end
+     def on_message(&b); @on_message = b; end
+   end
+ end
@@ -0,0 +1,89 @@
+ module Ebooks
+   class MarkovModel
+     INTERIM = :interim # Special token marking newline/^/$ boundaries
+
+     attr_accessor :tokens
+     attr_reader :depth
+
+     def represent(token)
+       if token.nil? || token == "\n" || token.empty?
+         INTERIM
+       else
+         token
+       end
+     end
+
+     def consume(tokenized, depth=2)
+       @tokens = [INTERIM]
+       @depth = depth
+
+       tokenized.each do |tokens|
+         @tokens += tokens
+         @tokens << INTERIM
+       end
+
+       @model = {}
+
+       @tokens.each_with_index do |token, i|
+         prev_tokens = []
+
+         @depth.downto(1) do |j|
+           if i-j < 0; next
+           else; prev = represent(@tokens[i-j])
+           end
+           prev_tokens << prev
+         end
+
+         1.upto(@depth) do |j|
+           break if j > prev_tokens.length
+           ngram = prev_tokens.last(j)
+
+           unless ngram == INTERIM && prev_tokens[-1] == INTERIM
+             @model[ngram] ||= []
+             @model[ngram] << represent(token)
+           end
+         end
+       end
+
+       self
+     end
+
+     def chain(tokens)
+       next_token = nil
+       @depth.downto(1).each do |i|
+         next if tokens.length < i
+         matches = @model[tokens.last(i)]
+         if matches
+           #p tokens.last(i)
+           #puts "=> #{matches.inspect}"
+           next_token = matches.sample
+           break
+         end
+       end
+
+       raise ArgumentError if next_token.nil?
+
+       if next_token == INTERIM
+         return tokens
+       else
+         return chain(tokens + [next_token])
+       end
+     end
+
+     def generate
+       tokens = chain([@model[[INTERIM]].sample])
+       NLP.reconstruct(tokens)
+     end
+
+     def serialize
+       { 'model' => @model,
+         'depth' => @depth }
+     end
+
+     def deserialize(data)
+       @model = data['model']
+       @depth = data['depth']
+       self
+     end
+   end
+ end
@@ -0,0 +1,147 @@
+ #!/usr/bin/env ruby
+ # encoding: utf-8
+
+ require 'json'
+ require 'set'
+ require 'digest/md5'
+
+ module Ebooks
+   class Model
+     attr_accessor :hash, :sentences, :tokenized, :markov
+
+     def self.consume(txtpath)
+       Model.new.consume(txtpath)
+     end
+
+     def self.load(path)
+       data = Marshal.load(File.read(path))
+       Model.new.deserialize(data)
+     end
+
+     def consume(txtpath)
+       # Record hash of source file so we know to update later
+       @hash = Digest::MD5.hexdigest(File.read(txtpath))
+
+       text = File.read(txtpath)
+       log "Removing commented lines and mentions"
+
+       lines = text.split("\n")
+       keeping = []
+       lines.each do |l|
+         next if l.start_with?('#') || l.include?('RT')
+         processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
+         keeping << processed.join(' ')
+       end
+       text = NLP.normalize(keeping.join("\n"))
+
+       log "Segmenting text into sentences of 140 characters or less"
+       @sentences = NLP.sentences(text).reject do |s|
+         s.length > 140 || s.count('"')%2 != 0
+       end
+
+       log "Tokenizing #{@sentences.length} sentences"
+       @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
+       @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+
+       log "Building markov model (this may take a while)"
+       @markov = MarkovModel.new.consume(@tokenized)
+
+       self
+     end
+
+     # Produces a hash with the data needed to quickly
+     # reconstruct this corpus object
+     def serialize
+       return { 'hash' => @hash,
+                'tokenized' => @tokenized,
+                'tokensets' => @tokensets,
+                'markov' => @markov.serialize }
+     end
+
+     def save(path)
+       data = self.serialize
+       File.open(path, 'w') do |f|
+         f.write(Marshal.dump(data))
+       end
+       self
+     end
+
+     def deserialize(data)
+       @hash = data['hash']
+       @tokenized = data['tokenized']
+       @tokensets = data['tokensets']
+       @markov = MarkovModel.new.deserialize(data['markov'])
+       self
+     end
+
+     def replace_noun(sent)
+       tagged = NLP.tagger.add_tags(sent)
+
+       nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
+       to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
+       return sent if to_replace.nil?
+       replacement = NLP.nouns.sample
+       if to_replace.en.plural.length <= to_replace.length
+         replacement = replacement.en.plural(1)
+       end
+       sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
+       sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
+     end
+
+     def fix(tweet)
+       # This seems to require an external api call
+       begin
+         fixer = NLP.gingerice.parse(tweet)
+         log fixer if fixer['corrections']
+         tweet = fixer['result']
+       rescue Exception => e
+         log e.message
+         log e.backtrace
+       end
+
+       NLP.htmlentities.decode tweet
+     end
+
+     def markov_statement(limit=140, markov=nil)
+       markov ||= @markov
+       tweet = ""
+
+       while (tweet = markov.generate) do
+         next if tweet.length > limit
+         next if NLP.unmatched_enclosers?(tweet)
+         break if tweet.length > limit*0.4 || rand > 0.8
+       end
+
+       fix tweet
+     end
+
+     # Generates a response by looking for related sentences
+     # in the corpus and building a smaller markov model from these
+     def markov_response(input, limit=140)
+       inputset = NLP.tokenset(input)
+       log "Input tokenset: #{inputset.to_a}"
+
+       if inputset.empty?
+         # Very uninteresting input; no relevant response possible
+         return markov_statement(limit)
+       end
+
+       # Let's find all the sentences that might be relevant
+       relevant = []
+       @tokensets.each_with_index.map do |set, i|
+         if inputset.intersection(set).length > 0
+           relevant << @tokenized[i]
+         end
+       end
+
+       log "Found #{relevant.length} relevant tokenset matches"
+
+       if relevant.length < 3
+         return markov_statement(limit)
+       end
+
+       markov = MarkovModel.new.consume(relevant.sample(100))
+       markov_statement(limit, markov)
+     end
+   end
+ end
@@ -0,0 +1,142 @@
+ # encoding: utf-8
+
+ require 'linguistics'
+ Linguistics.use(:en, classes: [String])
+
+ module Ebooks
+   module NLP
+     # We don't necessarily want to use all of this stuff all the time
+     # Only load it when it is needed
+
+     def self.stopwords
+       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
+     end
+
+     def self.nouns
+       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
+     end
+
+     def self.adjectives
+       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
+     end
+
+     def self.tokenizer
+       # This tokenizer is used for dividing sentences into words
+       # It's too slow for finding sentences in paragraphs, hence tactful
+       require 'tokenizer'
+       @tokenizer ||= Tokenizer::Tokenizer.new(:en)
+     end
+
+     def self.tactful
+       require 'tactful_tokenizer'
+       @tactful ||= TactfulTokenizer::Model.new
+     end
+
+     def self.tagger
+       require 'engtagger'
+       @tagger ||= EngTagger.new
+     end
+
+     def self.stemmer
+       require 'lingua/stemmer'
+       @stemmer ||= Lingua::Stemmer.new
+     end
+
+     def self.gingerice
+       require 'gingerice'
+       Gingerice::Parser.new # No caching for this one
+     end
+
+     def self.htmlentities
+       require 'htmlentities'
+       @htmlentities ||= HTMLEntities.new
+     end
+
+     ### Utility functions which wrap the above
+
+     def self.sentences(text)
+       tactful.tokenize_text(text)
+     end
+
+     def self.normalize(text)
+       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
+     end
+
+     def self.tokenize(sentence)
+       # This is hacky, but an ad hoc approach seems to be
+       # most reliable for now. Tokenization libraries have oddities
+       # that are hard to correct.
+       sentence.split(/\s/).map do |token|
+         exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^:\w$/, /^http/]
+         if exceptions.find { |r| r.match(token) }
+           token
+         else
+           token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
+         end
+       end.flatten
+     end
+
+     def self.tokenset(sentence)
+       tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
+       tokens.map(&:downcase)
+             .reject { |token| stopwords.include?(token) }
+             .to_set
+     end
+
+     def self.space_between?(token1, token2)
+       p1 = self.punctuation?(token1)
+       p2 = self.punctuation?(token2)
+       if p1 && p2 # "foo?!"
+         false
+       elsif !p1 && p2 # "foo."
+         false
+       elsif p1 && !p2 # "foo. rah"
+         true
+       else # "foo rah"
+         true
+       end
+     end
+
+     def self.reconstruct(tokens)
+       # Put tokens back together into a nice looking sentence
+       text = ""
+       last_token = nil
+       tokens.each do |token|
+         text += ' ' if last_token && space_between?(last_token, token)
+         text += token
+         last_token = token
+       end
+       text
+     end
+
+     # Deliberately limit our punctuation handling to stuff we can do consistently
+     # It'll just be a part of a token if we don't split it out, and that's fine
+     PUNCTUATION = ".?!,"
+
+     def self.punctuation?(token)
+       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
+     end
+
+     def self.unmatched_enclosers?(text)
+       # Weird quotes are an instant giveaway. Let's do paren-matching.
+       enclosers = ['**', '""', '()', '[]', '``']
+       enclosers.each do |pair|
+         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
+         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
+
+         opened = 0
+
+         tokenize(text).each do |token|
+           opened += 1 if token.match(starter)
+           opened -= 1 if token.match(ender)
+
+           return true if opened < 0 # Too many ends!
+         end
+
+         return true if opened != 0 # Mismatch somewhere.
+       end
+
+       false
+     end
+   end
+ end
@@ -0,0 +1,3 @@
+ module Ebooks
+   VERSION = "2.0.0"
+ end
@@ -0,0 +1,20 @@
+ gem 'minitest'
+
+ def log(*args)
+   STDERR.puts args.map(&:to_s).join(' ')
+   STDERR.flush
+ end
+
+ module Ebooks
+   GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+   DATA_PATH = File.join(GEM_PATH, 'data')
+   SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
+   TEST_PATH = File.join(GEM_PATH, 'test')
+   TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
+ end
+
+ require 'twitter_ebooks/nlp'
+ require 'twitter_ebooks/archiver'
+ require 'twitter_ebooks/markov'
+ require 'twitter_ebooks/model'
+ require 'twitter_ebooks/bot'
data/skeleton/Procfile ADDED
@@ -0,0 +1 @@
+ worker: ruby bots.rb start
data/skeleton/bots.rb ADDED
@@ -0,0 +1,47 @@
+ #!/usr/bin/env ruby
+
+ require 'twitter_ebooks'
+
+ # This is an example bot definition with event handlers commented out
+ # You can define as many of these as you like; they will run simultaneously
+
+ Ebooks::Bot.new("{{BOT_NAME}}") do |bot|
+   # Consumer details come from registering an app at https://dev.twitter.com/
+   # OAuth details can be fetched with https://github.com/marcel/twurl
+   bot.consumer_key = "" # Your app consumer key
+   bot.consumer_secret = "" # Your app consumer secret
+   bot.oauth_token = "" # Token connecting the app to this account
+   bot.oauth_token_secret = "" # Secret connecting the app to this account
+
+   bot.on_message do |dm|
+     # Reply to a DM
+     # bot.reply(dm, "secret secrets")
+   end
+
+   bot.on_follow do |user|
+     # Follow a user back
+     # bot.follow(user[:screen_name])
+   end
+
+   bot.on_mention do |tweet, meta|
+     # Reply to a mention
+     # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
+   end
+
+   bot.on_timeline do |tweet, meta|
+     # Reply to a tweet in the bot's timeline
+     # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
+   end
+
+   bot.scheduler.every '24h' do
+     # Tweet something every 24 hours
+     # See https://github.com/jmettraux/rufus-scheduler
+     # bot.tweet("hi")
+   end
+ end
+
+ EM.run do
+   Ebooks::Bot.all.each do |bot|
+     bot.start
+   end
+ end
@@ -0,0 +1 @@
+ Put raw text files in here and process them with `ebooks consume` to make Markov models.
@@ -0,0 +1 @@
+ This is where the output of `ebooks consume <corpus_path>` goes. You can load these files with `Model.load(path)`, or run `ebooks gen <path>` to test generation from the command line.
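
As a rough sketch of how the pieces above fit together outside the `ebooks` CLI, the `Ebooks::Model` API shown in this version can also be driven directly from Ruby. The corpus and model paths below are placeholders, and this assumes the gem and its NLP dependencies are installed (note that `markov_statement` passes generated text through Gingerice, which makes an external API call):

```ruby
require 'twitter_ebooks'

# Hypothetical paths; use any raw text corpus and output location.
corpus_path = 'corpus/example.txt'
model_path  = 'model/example.model'

# Build a model from raw text and persist it (a Marshal-dumped hash).
model = Ebooks::Model.consume(corpus_path)
model.save(model_path)

# Later: reload the serialized model and generate text from it.
model = Ebooks::Model.load(model_path)
puts model.markov_statement(140)        # standalone statement of at most 140 characters
puts model.markov_response('hello bot') # response built from related corpus sentences
```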