wikipedia_twitterbot 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/lib/wikipedia_twitterbot/article.rb +37 -6
- data/lib/wikipedia_twitterbot/article_text_cleaner.rb +29 -0
- data/lib/wikipedia_twitterbot/find_articles.rb +28 -0
- data/lib/wikipedia_twitterbot/find_images.rb +1 -2
- data/lib/wikipedia_twitterbot/rasterize.js +50 -0
- data/lib/wikipedia_twitterbot/tweet.rb +9 -4
- data/lib/wikipedia_twitterbot/twitter_client.rb +4 -0
- data/lib/wikipedia_twitterbot/version.rb +1 -1
- data/wikipedia_twitterbot.gemspec +1 -0
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 35215d526e53248f263a20c08f6b737fdb09ea6df5b814543f9d6a55f19d69cd
|
4
|
+
data.tar.gz: fc3db6117288c4cb6531dc988278bfa39c1990ca598b7d583391c6607138ef7d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fce11b334834aa0d8d3f296edd55082d2f3c32bf70dbabc0271eeb09a339862cfec4c95adf382707d5fd790d7224d18b4fb84e3c79709ae57aa42e395d64c578
|
7
|
+
data.tar.gz: aa7dc3aac490003622aad8680847af3e0ffd9500bba5dd8bd20ab28890b09a64c7cf17a047e8bcc11699bd0fd8ae83d03664edb43866d435553cd694c9456932
|
data/.gitignore
CHANGED
@@ -2,9 +2,11 @@ require 'active_record'
|
|
2
2
|
require 'activerecord-import'
|
3
3
|
require 'sqlite3'
|
4
4
|
require 'logger'
|
5
|
+
require 'fileutils'
|
5
6
|
require_relative 'tweet'
|
6
7
|
require_relative 'twitter_client'
|
7
8
|
require_relative 'find_images'
|
9
|
+
require_relative 'article_text_cleaner'
|
8
10
|
|
9
11
|
class Article < ActiveRecord::Base
|
10
12
|
class << self
|
@@ -88,11 +90,12 @@ class Article < ActiveRecord::Base
|
|
88
90
|
####################
|
89
91
|
# Instance methods #
|
90
92
|
####################
|
91
|
-
def tweet(tweet_text)
|
92
|
-
Tweet.new(tweet_text,
|
93
|
+
def tweet(tweet_text, opts = {})
|
94
|
+
@tweet_result = Tweet.new(tweet_text, opts).result
|
93
95
|
self.tweeted = true
|
94
96
|
save
|
95
|
-
'tweeted'
|
97
|
+
pp 'tweeted'
|
98
|
+
@tweet_result
|
96
99
|
rescue StandardError => e
|
97
100
|
self.failed_tweet_at = Time.now
|
98
101
|
save
|
@@ -100,6 +103,7 @@ class Article < ActiveRecord::Base
|
|
100
103
|
end
|
101
104
|
|
102
105
|
def screenshot_path
|
106
|
+
FileUtils.mkdir_p('screenshots') unless File.directory?('screenshots')
|
103
107
|
"screenshots/#{escaped_title}.png"
|
104
108
|
end
|
105
109
|
|
@@ -134,10 +138,16 @@ class Article < ActiveRecord::Base
|
|
134
138
|
"https://en.wikipedia.org/wiki/#{escaped_title}?veaction=edit&summary=%23#{bot_name}"
|
135
139
|
end
|
136
140
|
|
141
|
+
def dirp
|
142
|
+
pp RASTERIZE_PATH
|
143
|
+
end
|
144
|
+
|
145
|
+
RASTERIZE_PATH = "#{__dir__}/rasterize.js".freeze
|
137
146
|
def make_screenshot
|
138
|
-
|
139
|
-
|
140
|
-
|
147
|
+
# Use rasterize script to make a screenshot
|
148
|
+
%x[phantomjs #{RASTERIZE_PATH} #{mobile_url} #{screenshot_path} 1000px*1000px]
|
149
|
+
# Trim any extra blank space, which may or may not be present.
|
150
|
+
%x[convert #{screenshot_path} -trim #{screenshot_path}]
|
141
151
|
end
|
142
152
|
|
143
153
|
def hashtag
|
@@ -148,5 +158,26 @@ class Article < ActiveRecord::Base
|
|
148
158
|
self.class.bot_name
|
149
159
|
end
|
150
160
|
|
161
|
+
def wikilinks
|
162
|
+
return @links if @links.present?
|
163
|
+
query = { prop: 'links', titles: title, plnamespace: '0', pllimit: 500 }
|
164
|
+
response = Wiki.query query
|
165
|
+
@links = response.data['pages'].values.first['links'].map { |link| link['title'] }
|
166
|
+
@links
|
167
|
+
end
|
168
|
+
|
169
|
+
def page_text
|
170
|
+
@page_text ||= Wiki.get_page_content title
|
171
|
+
end
|
172
|
+
|
173
|
+
def plaintext
|
174
|
+
@plaintext = ArticleTextCleaner.convert(page_text)
|
175
|
+
end
|
176
|
+
|
177
|
+
def sentence_with(text)
|
178
|
+
# TODO: Remove the plaintext footnote remnants
|
179
|
+
plaintext[/[^.?!\n]*#{Regexp.quote text}[^.?!]*[.?!]/i]
|
180
|
+
end
|
181
|
+
|
151
182
|
class NoImageError < StandardError; end
|
152
183
|
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'pandoc-ruby'
|
2
|
+
|
3
|
+
class ArticleTextCleaner
|
4
|
+
def self.convert(page_text)
|
5
|
+
new(page_text).convert
|
6
|
+
end
|
7
|
+
|
8
|
+
def initialize(page_text)
|
9
|
+
@page_text = page_text
|
10
|
+
end
|
11
|
+
|
12
|
+
def convert
|
13
|
+
@output = PandocRuby.new(@page_text, from: :mediawiki, to: :plain).convert
|
14
|
+
remove_refs
|
15
|
+
replace_single_linebreaks
|
16
|
+
@output
|
17
|
+
end
|
18
|
+
|
19
|
+
# Refs end up in plaintext as: [12]
|
20
|
+
def remove_refs
|
21
|
+
@output.gsub!(/\[\d+\]/, '')
|
22
|
+
end
|
23
|
+
|
24
|
+
# Linebreaks just for line wrapping appear where spaces should be.
|
25
|
+
# Double line breaks happen between paragraphs; leave those in place.
|
26
|
+
def replace_single_linebreaks
|
27
|
+
@output.gsub!(/(?<!\n)\n(?!\n)/, ' ')
|
28
|
+
end
|
29
|
+
end
|
@@ -30,10 +30,38 @@ class FindArticles
|
|
30
30
|
by_ids(ids)
|
31
31
|
end
|
32
32
|
|
33
|
+
def self.by_title(title)
|
34
|
+
existing = Article.find_by(title: title)
|
35
|
+
return existing if existing.present?
|
36
|
+
page_data = Wiki.query title_info_query(title)
|
37
|
+
article_data = page_data.data['pages'].values.first
|
38
|
+
article = Article.new(id: article_data['pageid'],
|
39
|
+
title: article_data['title'],
|
40
|
+
latest_revision: article_data['lastrevid'],
|
41
|
+
latest_revision_datetime: article_data['touched'])
|
42
|
+
|
43
|
+
return article unless article_data['redirect']
|
44
|
+
|
45
|
+
# If it's a redirect, return the redirect target instead.
|
46
|
+
redirect_target = article.wikilinks.first
|
47
|
+
return by_title(redirect_target)
|
48
|
+
end
|
49
|
+
|
33
50
|
####################
|
34
51
|
# Internal methods #
|
35
52
|
####################
|
36
53
|
|
54
|
+
def self.title_revisions_query(title)
|
55
|
+
{ prop: 'revisions',
|
56
|
+
titles: title,
|
57
|
+
rvprop: 'userid|ids|timestamp' }
|
58
|
+
end
|
59
|
+
|
60
|
+
def self.title_info_query(title)
|
61
|
+
{ prop: 'info',
|
62
|
+
titles: title }
|
63
|
+
end
|
64
|
+
|
37
65
|
def self.revisions_query(article_ids)
|
38
66
|
{ prop: 'revisions',
|
39
67
|
pageids: article_ids,
|
@@ -0,0 +1,50 @@
|
|
1
|
+
// Adapted slightly from https://github.com/ariya/phantomjs/blob/master/examples/rasterize.js
|
2
|
+
// License: 3-clause BSD https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD
|
3
|
+
"use strict";
|
4
|
+
var page = require('webpage').create(),
|
5
|
+
system = require('system'),
|
6
|
+
address, output, size, pageWidth, pageHeight;
|
7
|
+
|
8
|
+
if (system.args.length < 3 || system.args.length > 5) {
|
9
|
+
console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
|
10
|
+
console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
|
11
|
+
console.log(' image (png/jpg output) examples: "1920px" entire page, window width 1920px');
|
12
|
+
console.log(' "800px*600px" window, clipped to 800x600');
|
13
|
+
phantom.exit(1);
|
14
|
+
} else {
|
15
|
+
address = system.args[1];
|
16
|
+
output = system.args[2];
|
17
|
+
page.viewportSize = { width: 600, height: 600 };
|
18
|
+
if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
|
19
|
+
size = system.args[3].split('*');
|
20
|
+
page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
|
21
|
+
: { format: system.args[3], orientation: 'portrait', margin: '1cm' };
|
22
|
+
} else if (system.args.length > 3 && system.args[3].substr(-2) === "px") {
|
23
|
+
size = system.args[3].split('*');
|
24
|
+
if (size.length === 2) {
|
25
|
+
var pageWidth = parseInt(size[0], 10),
|
26
|
+
pageHeight = parseInt(size[1], 10);
|
27
|
+
page.viewportSize = { width: pageWidth, height: pageHeight };
|
28
|
+
page.clipRect = { top: 0, left: 0, width: pageWidth + 20, height: pageHeight };
|
29
|
+
} else {
|
30
|
+
console.log("size:", system.args[3]);
|
31
|
+
var pageWidth = parseInt(system.args[3], 10),
|
32
|
+
pageHeight = parseInt(pageWidth * 3/4, 10); // it's as good an assumption as any
|
33
|
+
console.log ("pageHeight:",pageHeight);
|
34
|
+
}
|
35
|
+
}
|
36
|
+
if (system.args.length > 4) {
|
37
|
+
page.zoomFactor = system.args[4];
|
38
|
+
}
|
39
|
+
page.open(address, function (status) {
|
40
|
+
if (status !== 'success') {
|
41
|
+
console.log('Unable to load the address!');
|
42
|
+
phantom.exit(1);
|
43
|
+
} else {
|
44
|
+
window.setTimeout(function () {
|
45
|
+
page.render(output);
|
46
|
+
phantom.exit();
|
47
|
+
}, 200);
|
48
|
+
}
|
49
|
+
});
|
50
|
+
}
|
@@ -2,6 +2,7 @@ require 'twitter'
|
|
2
2
|
|
3
3
|
# Finds tweetable articles, tweets them
|
4
4
|
class Tweet
|
5
|
+
attr_reader :result
|
5
6
|
# Find an article to tweet and tweet it
|
6
7
|
def self.anything
|
7
8
|
# Randomly tweet either the earlier tweetable Article in the database
|
@@ -20,13 +21,17 @@ class Tweet
|
|
20
21
|
###############
|
21
22
|
# Twitter API #
|
22
23
|
###############
|
23
|
-
def initialize(tweet,
|
24
|
-
if
|
24
|
+
def initialize(tweet, opts = {})
|
25
|
+
if opts[:commons_image]
|
26
|
+
filename = opts.delete(:commons_image)
|
25
27
|
Wiki.save_commons_image filename
|
26
|
-
TwitterClient.new.client.update_with_media(tweet, File.new(filename))
|
28
|
+
@result = TwitterClient.new.client.update_with_media(tweet, File.new(filename), opts)
|
27
29
|
File.delete filename
|
30
|
+
elsif opts[:filename]
|
31
|
+
filename = opts.delete(:filename)
|
32
|
+
@result = TwitterClient.new.client.update_with_media(tweet, File.new(filename), opts)
|
28
33
|
else
|
29
|
-
TwitterClient.new.client.update(tweet)
|
34
|
+
@result = TwitterClient.new.client.update(tweet, opts)
|
30
35
|
end
|
31
36
|
end
|
32
37
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wikipedia_twitterbot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sage Ross
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pandoc-ruby
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
description:
|
126
140
|
email:
|
127
141
|
- sage@ragesoss.com
|
@@ -139,6 +153,7 @@ files:
|
|
139
153
|
- bin/setup
|
140
154
|
- lib/wikipedia_twitterbot.rb
|
141
155
|
- lib/wikipedia_twitterbot/article.rb
|
156
|
+
- lib/wikipedia_twitterbot/article_text_cleaner.rb
|
142
157
|
- lib/wikipedia_twitterbot/category_filter.rb
|
143
158
|
- lib/wikipedia_twitterbot/db/001_create_articles.rb
|
144
159
|
- lib/wikipedia_twitterbot/db/bootstrap.rb
|
@@ -147,6 +162,7 @@ files:
|
|
147
162
|
- lib/wikipedia_twitterbot/find_images.rb
|
148
163
|
- lib/wikipedia_twitterbot/high_pageviews.rb
|
149
164
|
- lib/wikipedia_twitterbot/ores.rb
|
165
|
+
- lib/wikipedia_twitterbot/rasterize.js
|
150
166
|
- lib/wikipedia_twitterbot/tweet.rb
|
151
167
|
- lib/wikipedia_twitterbot/twitter_client.rb
|
152
168
|
- lib/wikipedia_twitterbot/version.rb
|
@@ -173,7 +189,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
173
189
|
version: '0'
|
174
190
|
requirements: []
|
175
191
|
rubyforge_project:
|
176
|
-
rubygems_version: 2.
|
192
|
+
rubygems_version: 2.7.3
|
177
193
|
signing_key:
|
178
194
|
specification_version: 4
|
179
195
|
summary: Tools for building Wikipedia-focused Twitter bots
|