wikipedia_twitterbot 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/lib/wikipedia_twitterbot/article.rb +37 -6
- data/lib/wikipedia_twitterbot/article_text_cleaner.rb +29 -0
- data/lib/wikipedia_twitterbot/find_articles.rb +28 -0
- data/lib/wikipedia_twitterbot/find_images.rb +1 -2
- data/lib/wikipedia_twitterbot/rasterize.js +50 -0
- data/lib/wikipedia_twitterbot/tweet.rb +9 -4
- data/lib/wikipedia_twitterbot/twitter_client.rb +4 -0
- data/lib/wikipedia_twitterbot/version.rb +1 -1
- data/wikipedia_twitterbot.gemspec +1 -0
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 35215d526e53248f263a20c08f6b737fdb09ea6df5b814543f9d6a55f19d69cd
|
4
|
+
data.tar.gz: fc3db6117288c4cb6531dc988278bfa39c1990ca598b7d583391c6607138ef7d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fce11b334834aa0d8d3f296edd55082d2f3c32bf70dbabc0271eeb09a339862cfec4c95adf382707d5fd790d7224d18b4fb84e3c79709ae57aa42e395d64c578
|
7
|
+
data.tar.gz: aa7dc3aac490003622aad8680847af3e0ffd9500bba5dd8bd20ab28890b09a64c7cf17a047e8bcc11699bd0fd8ae83d03664edb43866d435553cd694c9456932
|
data/.gitignore
CHANGED
data/lib/wikipedia_twitterbot/article.rb
CHANGED
@@ -2,9 +2,11 @@ require 'active_record'
|
|
2
2
|
require 'activerecord-import'
|
3
3
|
require 'sqlite3'
|
4
4
|
require 'logger'
|
5
|
+
require 'fileutils'
|
5
6
|
require_relative 'tweet'
|
6
7
|
require_relative 'twitter_client'
|
7
8
|
require_relative 'find_images'
|
9
|
+
require_relative 'article_text_cleaner'
|
8
10
|
|
9
11
|
class Article < ActiveRecord::Base
|
10
12
|
class << self
|
@@ -88,11 +90,12 @@ class Article < ActiveRecord::Base
|
|
88
90
|
####################
|
89
91
|
# Instance methods #
|
90
92
|
####################
|
91
|
-
def tweet(tweet_text)
|
92
|
-
Tweet.new(tweet_text,
|
93
|
+
def tweet(tweet_text, opts = {})
|
94
|
+
@tweet_result = Tweet.new(tweet_text, opts).result
|
93
95
|
self.tweeted = true
|
94
96
|
save
|
95
|
-
'tweeted'
|
97
|
+
pp 'tweeted'
|
98
|
+
@tweet_result
|
96
99
|
rescue StandardError => e
|
97
100
|
self.failed_tweet_at = Time.now
|
98
101
|
save
|
@@ -100,6 +103,7 @@ class Article < ActiveRecord::Base
|
|
100
103
|
end
|
101
104
|
|
102
105
|
def screenshot_path
|
106
|
+
FileUtils.mkdir_p('screenshots') unless File.directory?('screenshots')
|
103
107
|
"screenshots/#{escaped_title}.png"
|
104
108
|
end
|
105
109
|
|
@@ -134,10 +138,16 @@ class Article < ActiveRecord::Base
|
|
134
138
|
"https://en.wikipedia.org/wiki/#{escaped_title}?veaction=edit&summary=%23#{bot_name}"
|
135
139
|
end
|
136
140
|
|
141
|
+
def dirp
|
142
|
+
pp RASTERIZE_PATH
|
143
|
+
end
|
144
|
+
|
145
|
+
RASTERIZE_PATH = "#{__dir__}/rasterize.js".freeze
|
137
146
|
def make_screenshot
|
138
|
-
|
139
|
-
|
140
|
-
|
147
|
+
# Use rasterize script to make a screenshot
|
148
|
+
%x[phantomjs #{RASTERIZE_PATH} #{mobile_url} #{screenshot_path} 1000px*1000px]
|
149
|
+
# Trim any extra blank space, which may or may not be present.
|
150
|
+
%x[convert #{screenshot_path} -trim #{screenshot_path}]
|
141
151
|
end
|
142
152
|
|
143
153
|
def hashtag
|
@@ -148,5 +158,26 @@ class Article < ActiveRecord::Base
|
|
148
158
|
self.class.bot_name
|
149
159
|
end
|
150
160
|
|
161
|
+
def wikilinks
|
162
|
+
return @links if @links.present?
|
163
|
+
query = { prop: 'links', titles: title, plnamespace: '0', pllimit: 500 }
|
164
|
+
response = Wiki.query query
|
165
|
+
@links = response.data['pages'].values.first['links'].map { |link| link['title'] }
|
166
|
+
@links
|
167
|
+
end
|
168
|
+
|
169
|
+
def page_text
|
170
|
+
@page_text ||= Wiki.get_page_content title
|
171
|
+
end
|
172
|
+
|
173
|
+
def plaintext
|
174
|
+
@plaintext = ArticleTextCleaner.convert(page_text)
|
175
|
+
end
|
176
|
+
|
177
|
+
def sentence_with(text)
|
178
|
+
# TODO: Remove the plaintext footnote remnants
|
179
|
+
plaintext[/[^.?!\n]*#{Regexp.quote text}[^.?!]*[.?!]/i]
|
180
|
+
end
|
181
|
+
|
151
182
|
class NoImageError < StandardError; end
|
152
183
|
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'pandoc-ruby'
|
2
|
+
|
3
|
+
class ArticleTextCleaner
|
4
|
+
def self.convert(page_text)
|
5
|
+
new(page_text).convert
|
6
|
+
end
|
7
|
+
|
8
|
+
def initialize(page_text)
|
9
|
+
@page_text = page_text
|
10
|
+
end
|
11
|
+
|
12
|
+
def convert
|
13
|
+
@output = PandocRuby.new(@page_text, from: :mediawiki, to: :plain).convert
|
14
|
+
remove_refs
|
15
|
+
replace_single_linebreaks
|
16
|
+
@output
|
17
|
+
end
|
18
|
+
|
19
|
+
# Refs end up in plaintext as: [12]
|
20
|
+
def remove_refs
|
21
|
+
@output.gsub!(/\[\d+\]/, '')
|
22
|
+
end
|
23
|
+
|
24
|
+
# Linebreaks just for line wrapping appear where spaces should be.
|
25
|
+
# Double line breaks happen between paragraphs; leave those in place.
|
26
|
+
def replace_single_linebreaks
|
27
|
+
@output.gsub!(/(?<!\n)\n(?!\n)/, ' ')
|
28
|
+
end
|
29
|
+
end
|
@@ -30,10 +30,38 @@ class FindArticles
|
|
30
30
|
by_ids(ids)
|
31
31
|
end
|
32
32
|
|
33
|
+
def self.by_title(title)
|
34
|
+
existing = Article.find_by(title: title)
|
35
|
+
return existing if existing.present?
|
36
|
+
page_data = Wiki.query title_info_query(title)
|
37
|
+
article_data = page_data.data['pages'].values.first
|
38
|
+
article = Article.new(id: article_data['pageid'],
|
39
|
+
title: article_data['title'],
|
40
|
+
latest_revision: article_data['lastrevid'],
|
41
|
+
latest_revision_datetime: article_data['touched'])
|
42
|
+
|
43
|
+
return article unless article_data['redirect']
|
44
|
+
|
45
|
+
# If it's a redirect, return the redirect target instead.
|
46
|
+
redirect_target = article.wikilinks.first
|
47
|
+
return by_title(redirect_target)
|
48
|
+
end
|
49
|
+
|
33
50
|
####################
|
34
51
|
# Internal methods #
|
35
52
|
####################
|
36
53
|
|
54
|
+
def self.title_revisions_query(title)
|
55
|
+
{ prop: 'revisions',
|
56
|
+
titles: title,
|
57
|
+
rvprop: 'userid|ids|timestamp' }
|
58
|
+
end
|
59
|
+
|
60
|
+
def self.title_info_query(title)
|
61
|
+
{ prop: 'info',
|
62
|
+
titles: title }
|
63
|
+
end
|
64
|
+
|
37
65
|
def self.revisions_query(article_ids)
|
38
66
|
{ prop: 'revisions',
|
39
67
|
pageids: article_ids,
|
@@ -0,0 +1,50 @@
|
|
1
|
+
// Adapted slightly from https://github.com/ariya/phantomjs/blob/master/examples/rasterize.js
|
2
|
+
// License: 3-clause BSD https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD
|
3
|
+
"use strict";
|
4
|
+
var page = require('webpage').create(),
|
5
|
+
system = require('system'),
|
6
|
+
address, output, size, pageWidth, pageHeight;
|
7
|
+
|
8
|
+
if (system.args.length < 3 || system.args.length > 5) {
|
9
|
+
console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
|
10
|
+
console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
|
11
|
+
console.log(' image (png/jpg output) examples: "1920px" entire page, window width 1920px');
|
12
|
+
console.log(' "800px*600px" window, clipped to 800x600');
|
13
|
+
phantom.exit(1);
|
14
|
+
} else {
|
15
|
+
address = system.args[1];
|
16
|
+
output = system.args[2];
|
17
|
+
page.viewportSize = { width: 600, height: 600 };
|
18
|
+
if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
|
19
|
+
size = system.args[3].split('*');
|
20
|
+
page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
|
21
|
+
: { format: system.args[3], orientation: 'portrait', margin: '1cm' };
|
22
|
+
} else if (system.args.length > 3 && system.args[3].substr(-2) === "px") {
|
23
|
+
size = system.args[3].split('*');
|
24
|
+
if (size.length === 2) {
|
25
|
+
var pageWidth = parseInt(size[0], 10),
|
26
|
+
pageHeight = parseInt(size[1], 10);
|
27
|
+
page.viewportSize = { width: pageWidth, height: pageHeight };
|
28
|
+
page.clipRect = { top: 0, left: 0, width: pageWidth + 20, height: pageHeight };
|
29
|
+
} else {
|
30
|
+
console.log("size:", system.args[3]);
|
31
|
+
var pageWidth = parseInt(system.args[3], 10),
|
32
|
+
pageHeight = parseInt(pageWidth * 3/4, 10); // it's as good an assumption as any
|
33
|
+
console.log ("pageHeight:",pageHeight);
|
34
|
+
}
|
35
|
+
}
|
36
|
+
if (system.args.length > 4) {
|
37
|
+
page.zoomFactor = system.args[4];
|
38
|
+
}
|
39
|
+
page.open(address, function (status) {
|
40
|
+
if (status !== 'success') {
|
41
|
+
console.log('Unable to load the address!');
|
42
|
+
phantom.exit(1);
|
43
|
+
} else {
|
44
|
+
window.setTimeout(function () {
|
45
|
+
page.render(output);
|
46
|
+
phantom.exit();
|
47
|
+
}, 200);
|
48
|
+
}
|
49
|
+
});
|
50
|
+
}
|
@@ -2,6 +2,7 @@ require 'twitter'
|
|
2
2
|
|
3
3
|
# Finds tweetable articles, tweets them
|
4
4
|
class Tweet
|
5
|
+
attr_reader :result
|
5
6
|
# Find an article to tweet and tweet it
|
6
7
|
def self.anything
|
7
8
|
# Randomly tweet either the earlier tweetable Article in the database
|
@@ -20,13 +21,17 @@ class Tweet
|
|
20
21
|
###############
|
21
22
|
# Twitter API #
|
22
23
|
###############
|
23
|
-
def initialize(tweet,
|
24
|
-
if
|
24
|
+
def initialize(tweet, opts = {})
|
25
|
+
if opts[:commons_image]
|
26
|
+
filename = opts.delete(:commons_image)
|
25
27
|
Wiki.save_commons_image filename
|
26
|
-
TwitterClient.new.client.update_with_media(tweet, File.new(filename))
|
28
|
+
@result = TwitterClient.new.client.update_with_media(tweet, File.new(filename), opts)
|
27
29
|
File.delete filename
|
30
|
+
elsif opts[:filename]
|
31
|
+
filename = opts.delete(:filename)
|
32
|
+
@result = TwitterClient.new.client.update_with_media(tweet, File.new(filename), opts)
|
28
33
|
else
|
29
|
-
TwitterClient.new.client.update(tweet)
|
34
|
+
@result = TwitterClient.new.client.update(tweet, opts)
|
30
35
|
end
|
31
36
|
end
|
32
37
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wikipedia_twitterbot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sage Ross
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pandoc-ruby
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
description:
|
126
140
|
email:
|
127
141
|
- sage@ragesoss.com
|
@@ -139,6 +153,7 @@ files:
|
|
139
153
|
- bin/setup
|
140
154
|
- lib/wikipedia_twitterbot.rb
|
141
155
|
- lib/wikipedia_twitterbot/article.rb
|
156
|
+
- lib/wikipedia_twitterbot/article_text_cleaner.rb
|
142
157
|
- lib/wikipedia_twitterbot/category_filter.rb
|
143
158
|
- lib/wikipedia_twitterbot/db/001_create_articles.rb
|
144
159
|
- lib/wikipedia_twitterbot/db/bootstrap.rb
|
@@ -147,6 +162,7 @@ files:
|
|
147
162
|
- lib/wikipedia_twitterbot/find_images.rb
|
148
163
|
- lib/wikipedia_twitterbot/high_pageviews.rb
|
149
164
|
- lib/wikipedia_twitterbot/ores.rb
|
165
|
+
- lib/wikipedia_twitterbot/rasterize.js
|
150
166
|
- lib/wikipedia_twitterbot/tweet.rb
|
151
167
|
- lib/wikipedia_twitterbot/twitter_client.rb
|
152
168
|
- lib/wikipedia_twitterbot/version.rb
|
@@ -173,7 +189,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
173
189
|
version: '0'
|
174
190
|
requirements: []
|
175
191
|
rubyforge_project:
|
176
|
-
rubygems_version: 2.
|
192
|
+
rubygems_version: 2.7.3
|
177
193
|
signing_key:
|
178
194
|
specification_version: 4
|
179
195
|
summary: Tools for building Wikipedia-focused Twitter bots
|