twitter_ebooks 2.0.0

lib/twitter_ebooks/bot.rb ADDED
@@ -0,0 +1,145 @@
+ #!/usr/bin/env ruby
+ require 'twitter'
+ require 'tweetstream'
+ require 'rufus/scheduler'
+
+ module Ebooks
+   class Bot
+     attr_accessor :consumer_key, :consumer_secret,
+                   :oauth_token, :oauth_token_secret
+
+     attr_accessor :username
+
+     attr_reader :twitter, :stream
+
+     @@all = [] # List of all defined bots
+     def self.all; @@all; end
+
+     def initialize(username, &b)
+       # Set defaults
+       @username = username
+
+       # Override with callback
+       b.call(self)
+
+       Bot.all.push(self)
+     end
+
+     def log(*args)
+       STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
+       STDERR.flush
+     end
+
+     # Connects to tweetstream and opens event handlers for this bot
+     def start
+       TweetStream.configure do |config|
+         config.consumer_key = @consumer_key
+         config.consumer_secret = @consumer_secret
+         config.oauth_token = @oauth_token
+         config.oauth_token_secret = @oauth_token_secret
+       end
+
+       Twitter.configure do |config|
+         config.consumer_key = @consumer_key
+         config.consumer_secret = @consumer_secret
+         config.oauth_token = @oauth_token
+         config.oauth_token_secret = @oauth_token_secret
+       end
+
+       @twitter = Twitter::Client.new
+       @stream = TweetStream::Client.new
+
+       @stream.on_error do |msg|
+         log "ERROR: #{msg}"
+       end
+
+       @stream.on_inited do
+         log "Online!"
+       end
+
+       @stream.on_event(:follow) do |event|
+         log "Followed by #{event[:source][:screen_name]}"
+         @on_follow.call(event[:source])
+       end
+
+       @stream.on_direct_message do |dm|
+         next if dm[:sender][:screen_name] == @username # Don't reply to self
+         log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
+         @on_message.call(dm)
+       end
+
+       @stream.userstream do |ev|
+         next unless ev[:text] # If it's not a text-containing tweet, ignore it
+         next if ev[:user][:screen_name] == @username # Ignore our own tweets
+
+         meta = {}
+         mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }
+
+         reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
+         reply_mentions << ev[:user][:screen_name]
+
+         meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
+
+         mless = ev[:text]
+         ev.attrs[:entities][:user_mentions].reverse.each do |entity|
+           mless = mless[0...entity[:indices][0]] + (mless[entity[:indices][1]+1..-1] || '')
+         end
+         meta[:mentionless] = mless
+
+         # To check if this is a mention, ensure:
+         # - The tweet mentions list contains our username
+         # - The tweet is not being retweeted by somebody else
+         # - Or soft-retweeted by somebody else
+         if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
+           log "Mention from #{ev[:user][:screen_name]}: #{ev[:text]}"
+           @on_mention.call(ev, meta)
+         else
+           @on_timeline.call(ev, meta)
+         end
+       end
+     end
+
+     # Wrapper for EM.add_timer
+     # Delays add a greater sense of humanity to bot behaviour
+     def delay(time, &b)
+       time = time.to_a.sample unless time.is_a? Integer
+       EM.add_timer(time, &b)
+     end
+
+     # Reply to a tweet or a DM.
+     # Applies configurable @reply_delay range before sending
+     def reply(ev, text, opts={})
+       opts = opts.clone
+       delay = @reply_delay.to_a.sample || 0
+
+       if ev.is_a? Twitter::DirectMessage
+         log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
+         EM.add_timer(delay) { @twitter.direct_message_create(ev[:sender][:screen_name], text, opts) }
+       elsif ev.is_a? Twitter::Tweet
+         log "Replying to @#{ev[:user][:screen_name]}: #{text}"
+         EM.add_timer(delay) { @twitter.update(text, in_reply_to_status_id: ev[:id]) }
+       else
+         raise ArgumentError, "Don't know how to reply to a #{ev.class}"
+       end
+     end
+
+     def scheduler
+       @scheduler ||= Rufus::Scheduler.new
+     end
+
+     def follow(*args)
+       log "Following #{args}"
+       @twitter.follow(*args)
+     end
+
+     def tweet(*args)
+       log "Tweeting #{args.inspect}"
+       @twitter.update(*args)
+     end
+
+     def on_follow(&b); @on_follow = b; end
+     def on_mention(&b); @on_mention = b; end
+     def on_timeline(&b); @on_timeline = b; end
+     def on_message(&b); @on_message = b; end
+   end
+ end
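
For reference, here is a hand-worked sketch (not part of the diff) of the `meta` hash that `Bot#start` builds for an incoming tweet; the bot name and tweet are hypothetical:

```ruby
# Suppose the bot runs as @ebooks_bot and @alice tweets:
#   "@ebooks_bot @carol hello there"
# Inside the userstream handler:
#   mentions            => ["ebooks_bot", "carol"]
#   reply_mentions      => ["carol", "alice"]   # drop ourselves, add the author
#   meta[:reply_prefix] => "@carol @alice "     # prepend to any reply text
#   meta[:mentionless]  => "hello there"        # tweet text with mentions removed
# Since our username is mentioned and it's not a retweet, @on_mention fires.
```
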
lib/twitter_ebooks/markov.rb ADDED
@@ -0,0 +1,86 @@
+ module Ebooks
+   class MarkovModel
+     INTERIM = :interim # Special token marking newline/^/$ boundaries
+
+     attr_accessor :tokens
+     attr_reader :depth
+
+     def represent(token)
+       if token.nil? || token == "\n" || token.empty?
+         INTERIM
+       else
+         token
+       end
+     end
+
+     def consume(tokenized, depth=2)
+       @tokens = [INTERIM]
+       @depth = depth
+
+       tokenized.each do |tokens|
+         @tokens += tokens
+         @tokens << INTERIM
+       end
+
+       @model = {}
+
+       @tokens.each_with_index do |token, i|
+         prev_tokens = []
+
+         @depth.downto(1) do |j|
+           next if i-j < 0
+           prev_tokens << represent(@tokens[i-j])
+         end
+
+         1.upto(@depth) do |j|
+           break if j > prev_tokens.length
+           ngram = prev_tokens.last(j)
+
+           # Don't record INTERIM -> INTERIM transitions
+           unless ngram == [INTERIM] && represent(token) == INTERIM
+             @model[ngram] ||= []
+             @model[ngram] << represent(token)
+           end
+         end
+       end
+
+       self
+     end
+
+     def chain(tokens)
+       next_token = nil
+       @depth.downto(1).each do |i|
+         next if tokens.length < i
+         matches = @model[tokens.last(i)]
+         if matches
+           next_token = matches.sample
+           break
+         end
+       end
+
+       raise ArgumentError, "no continuation for #{tokens.inspect}" if next_token.nil?
+
+       if next_token == INTERIM
+         return tokens
+       else
+         return chain(tokens + [next_token])
+       end
+     end
+
+     def generate
+       tokens = chain([@model[[INTERIM]].sample])
+       NLP.reconstruct(tokens)
+     end
+
+     def serialize
+       { 'model' => @model,
+         'depth' => @depth }
+     end
+
+     def deserialize(data)
+       @model = data['model']
+       @depth = data['depth']
+       self
+     end
+   end
+ end
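
To make `consume` and `chain` concrete, here is a hand-worked sketch (not part of the diff) of the model built from a single three-token sentence at the default depth of 2:

```ruby
model = Ebooks::MarkovModel.new.consume([%w[the cat sat]])
# @model now maps each 1-gram and 2-gram to its possible successors:
#   [:interim]        => ["the"]       ["the"] => ["cat"]
#   [:interim, "the"] => ["cat"]       ["cat"] => ["sat"]
#   ["the", "cat"]    => ["sat"]       ["sat"] => [:interim]
#   ["cat", "sat"]    => [:interim]
# generate picks a token following :interim and walks until :interim,
# always preferring the longest ngram that has recorded successors:
model.generate # => "the cat sat" (the only possible walk in this tiny model)
```
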
lib/twitter_ebooks/model.rb ADDED
@@ -0,0 +1,147 @@
+ #!/usr/bin/env ruby
+ # encoding: utf-8
+
+ require 'json'
+ require 'set'
+ require 'digest/md5'
+
+ module Ebooks
+   class Model
+     attr_accessor :hash, :sentences, :tokenized, :markov
+
+     def self.consume(txtpath)
+       Model.new.consume(txtpath)
+     end
+
+     def self.load(path)
+       data = Marshal.load(File.binread(path))
+       Model.new.deserialize(data)
+     end
+
+     def consume(txtpath)
+       # Record hash of source file so we know to update later
+       text = File.read(txtpath)
+       @hash = Digest::MD5.hexdigest(text)
+
+       log "Removing commented lines and mentions"
+
+       lines = text.split("\n")
+       keeping = []
+       lines.each do |l|
+         next if l.start_with?('#') || l.include?('RT')
+         processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
+         keeping << processed.join(' ')
+       end
+       text = NLP.normalize(keeping.join("\n"))
+
+       log "Segmenting text into sentences of 140 characters or less"
+       @sentences = NLP.sentences(text).reject do |s|
+         s.length > 140 || s.count('"') % 2 != 0
+       end
+
+       log "Tokenizing #{@sentences.length} sentences"
+       @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
+       @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+
+       log "Building markov model (this may take a while)"
+       @markov = MarkovModel.new.consume(@tokenized)
+
+       self
+     end
+
+     # Produces a hash with the data needed to quickly
+     # reconstruct this corpus object
+     def serialize
+       return { 'hash' => @hash,
+                'tokenized' => @tokenized,
+                'tokensets' => @tokensets,
+                'markov' => @markov.serialize }
+     end
+
+     def save(path)
+       data = self.serialize
+       File.open(path, 'wb') do |f|
+         f.write(Marshal.dump(data))
+       end
+       self
+     end
+
+     def deserialize(data)
+       @hash = data['hash']
+       @tokenized = data['tokenized']
+       @tokensets = data['tokensets']
+       @markov = MarkovModel.new.deserialize(data['markov'])
+       self
+     end
+
+     def replace_noun(sent)
+       tagged = NLP.tagger.add_tags(sent)
+
+       nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
+       to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
+       return sent if to_replace.nil?
+       replacement = NLP.nouns.sample
+       if to_replace.en.plural.length <= to_replace.length
+         replacement = replacement.en.plural(1)
+       end
+       sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
+       sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
+     end
+
+     def fix(tweet)
+       # This seems to require an external api call
+       begin
+         fixer = NLP.gingerice.parse(tweet)
+         log fixer if fixer['corrections']
+         tweet = fixer['result']
+       rescue StandardError => e
+         log e.message
+         log e.backtrace
+       end
+
+       NLP.htmlentities.decode tweet
+     end
+
+     def markov_statement(limit=140, markov=nil)
+       markov ||= @markov
+       tweet = ""
+
+       while (tweet = markov.generate) do
+         next if tweet.length > limit
+         next if NLP.unmatched_enclosers?(tweet)
+         break if tweet.length > limit * 0.4 || rand > 0.8
+       end
+
+       fix tweet
+     end
+
+     # Generates a response by looking for related sentences
+     # in the corpus and building a smaller markov model from these
+     def markov_response(input, limit=140)
+       inputset = NLP.tokenset(input)
+       log "Input tokenset: #{inputset.to_a}"
+
+       if inputset.empty?
+         # Very uninteresting input; no relevant response possible
+         return markov_statement(limit)
+       end
+
+       # Let's find all the sentences that might be relevant
+       relevant = []
+       @tokensets.each_with_index do |set, i|
+         if inputset.intersection(set).length > 0
+           relevant << @tokenized[i]
+         end
+       end
+
+       log "Found #{relevant.length} relevant tokenset matches"
+
+       if relevant.length < 3
+         return markov_statement(limit)
+       end
+
+       markov = MarkovModel.new.consume(relevant.sample(100))
+       markov_statement(limit, markov)
+     end
+   end
+ end
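
Putting the pieces together, a minimal usage sketch of the Model API above (the file paths are hypothetical):

```ruby
require 'twitter_ebooks'

# One-time: parse a text corpus and persist the model
model = Ebooks::Model.consume('corpus/tweets.txt')
model.save('model/tweets.model')

# Later: load it back and generate
model = Ebooks::Model.load('model/tweets.model')
puts model.markov_statement          # free-standing tweet-sized sentence
puts model.markov_response('hello')  # reply built from related sentences
```
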
lib/twitter_ebooks/nlp.rb ADDED
@@ -0,0 +1,142 @@
+ # encoding: utf-8
+
+ require 'linguistics'
+ Linguistics.use(:en, classes: [String])
+
+ module Ebooks
+   module NLP
+     # We don't necessarily want to use all of this stuff all the time
+     # Only load it when it is needed
+
+     def self.stopwords
+       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
+     end
+
+     def self.nouns
+       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
+     end
+
+     def self.adjectives
+       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
+     end
+
+     def self.tokenizer
+       # This tokenizer is used for dividing sentences into words
+       # It's too slow for finding sentences in paragraphs, hence tactful below
+       require 'tokenizer'
+       @tokenizer ||= Tokenizer::Tokenizer.new(:en)
+     end
+
+     def self.tactful
+       require 'tactful_tokenizer'
+       @tactful ||= TactfulTokenizer::Model.new
+     end
+
+     def self.tagger
+       require 'engtagger'
+       @tagger ||= EngTagger.new
+     end
+
+     def self.stemmer
+       require 'lingua/stemmer'
+       @stemmer ||= Lingua::Stemmer.new
+     end
+
+     def self.gingerice
+       require 'gingerice'
+       Gingerice::Parser.new # No caching for this one
+     end
+
+     def self.htmlentities
+       require 'htmlentities'
+       @htmlentities ||= HTMLEntities.new
+     end
+
+     ### Utility functions which wrap the above
+
+     def self.sentences(text)
+       tactful.tokenize_text(text)
+     end
+
+     def self.normalize(text)
+       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
+     end
+
+     def self.tokenize(sentence)
+       # This is hacky, but an ad hoc approach seems to be
+       # most reliable for now. Tokenization libraries have oddities
+       # that are hard to correct.
+       exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^http/]
+       sentence.split(/\s/).map do |token|
+         if exceptions.find { |r| r.match(token) }
+           token
+         else
+           token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
+         end
+       end.flatten
+     end
+
+     def self.tokenset(sentence)
+       tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
+       tokens.map(&:downcase)
+             .reject { |token| stopwords.include?(token) }
+             .to_set
+     end
+
+     def self.space_between?(token1, token2)
+       p1 = self.punctuation?(token1)
+       p2 = self.punctuation?(token2)
+       if p1 && p2 # "foo?!"
+         false
+       elsif !p1 && p2 # "foo."
+         false
+       elsif p1 && !p2 # "foo. rah"
+         true
+       else # "foo rah"
+         true
+       end
+     end
+
+     def self.reconstruct(tokens)
+       # Put tokens back together into a nice looking sentence
+       text = ""
+       last_token = nil
+       tokens.each do |token|
+         text += ' ' if last_token && space_between?(last_token, token)
+         text += token
+         last_token = token
+       end
+       text
+     end
+
+     # Deliberately limit our punctuation handling to stuff we can do consistently
+     # It'll just be a part of a token if we don't split it out, and that's fine
+     PUNCTUATION = ".?!,"
+
+     def self.punctuation?(token)
+       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
+     end
+
+     def self.unmatched_enclosers?(text)
+       # Weird quotes are an instant giveaway. Let's do paren-matching.
+       enclosers = ['**', '""', '()', '[]', '``']
+       enclosers.each do |pair|
+         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
+         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
+
+         opened = 0
+
+         tokenize(text).each do |token|
+           opened += 1 if token.match(starter)
+           opened -= 1 if token.match(ender)
+
+           return true if opened < 0 # Too many ends!
+         end
+
+         return true if opened != 0 # Mismatch somewhere.
+       end
+
+       false
+     end
+   end
+ end
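
A quick sketch of the tokenize/reconstruct round trip these helpers provide (inputs are hypothetical):

```ruby
require 'twitter_ebooks'

tokens = Ebooks::NLP.tokenize("Hello, world!")
# => ["Hello", ",", "world", "!"]   punctuation split off; @/#/http tokens kept whole

Ebooks::NLP.reconstruct(tokens)
# => "Hello, world!"                space_between? decides where spaces go

Ebooks::NLP.unmatched_enclosers?('he said "hello')
# => true                           a dangling quote would give the bot away
```
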
lib/twitter_ebooks/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Ebooks
+   VERSION = "2.0.0"
+ end
lib/twitter_ebooks.rb ADDED
@@ -0,0 +1,20 @@
+ gem 'minitest'
+
+ def log(*args)
+   STDERR.puts args.map(&:to_s).join(' ')
+   STDERR.flush
+ end
+
+ module Ebooks
+   GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+   DATA_PATH = File.join(GEM_PATH, 'data')
+   SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
+   TEST_PATH = File.join(GEM_PATH, 'test')
+   TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
+ end
+
+ require 'twitter_ebooks/nlp'
+ require 'twitter_ebooks/archiver'
+ require 'twitter_ebooks/markov'
+ require 'twitter_ebooks/model'
+ require 'twitter_ebooks/bot'
data/skeleton/Procfile ADDED
@@ -0,0 +1 @@
+ worker: ruby bots.rb start
data/skeleton/bots.rb ADDED
@@ -0,0 +1,47 @@
+ #!/usr/bin/env ruby
+
+ require 'twitter_ebooks'
+
+ # This is an example bot definition with event handlers commented out
+ # You can define as many of these as you like; they will run simultaneously
+
+ Ebooks::Bot.new("{{BOT_NAME}}") do |bot|
+   # Consumer details come from registering an app at https://dev.twitter.com/
+   # OAuth details can be fetched with https://github.com/marcel/twurl
+   bot.consumer_key = "" # Your app consumer key
+   bot.consumer_secret = "" # Your app consumer secret
+   bot.oauth_token = "" # Token connecting the app to this account
+   bot.oauth_token_secret = "" # Secret connecting the app to this account
+
+   bot.on_message do |dm|
+     # Reply to a DM
+     # bot.reply(dm, "secret secrets")
+   end
+
+   bot.on_follow do |user|
+     # Follow a user back
+     # bot.follow(user[:screen_name])
+   end
+
+   bot.on_mention do |tweet, meta|
+     # Reply to a mention
+     # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
+   end
+
+   bot.on_timeline do |tweet, meta|
+     # Reply to a tweet in the bot's timeline
+     # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
+   end
+
+   bot.scheduler.every '24h' do
+     # Tweet something every 24 hours
+     # See https://github.com/jmettraux/rufus-scheduler
+     # bot.tweet("hi")
+   end
+ end
+
+ EM.run do
+   Ebooks::Bot.all.each do |bot|
+     bot.start
+   end
+ end
data/skeleton/corpus/README.md ADDED
@@ -0,0 +1 @@
+ Put raw text files in here and process them with `ebooks consume` to make Markov models.
data/skeleton/model/README.md ADDED
@@ -0,0 +1 @@
+ This is where the output of `ebooks consume <corpus_path>` goes. You can load these files with `Model.load(path)`, or test them with `ebooks gen <path>`.