wikipedia_twitterbot 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 1330f2a58c111f7cee263a8746cd241424ce78cd
4
- data.tar.gz: 4ba11d3e6b833f66de96f57bb77150af90216f74
2
+ SHA256:
3
+ metadata.gz: 35215d526e53248f263a20c08f6b737fdb09ea6df5b814543f9d6a55f19d69cd
4
+ data.tar.gz: fc3db6117288c4cb6531dc988278bfa39c1990ca598b7d583391c6607138ef7d
5
5
  SHA512:
6
- metadata.gz: 87da8c83217ea2f27a2150cdbfd9ca4505daf0493e18d96a61fadcadc58b8bf038d954ba0c64018fba1c2a308a8afb2909ff5fd5c8abc6551f21549d21386881
7
- data.tar.gz: f82241415cfa16ec0f9c8c1670583e5ce8a2defbe85f00491f1f4b25945cf4192d0dc8a837f52f2474f841cae82fc451289e987e3eb3ac3698ed0ef2231e1503
6
+ metadata.gz: fce11b334834aa0d8d3f296edd55082d2f3c32bf70dbabc0271eeb09a339862cfec4c95adf382707d5fd790d7224d18b4fb84e3c79709ae57aa42e395d64c578
7
+ data.tar.gz: aa7dc3aac490003622aad8680847af3e0ffd9500bba5dd8bd20ab28890b09a64c7cf17a047e8bcc11699bd0fd8ae83d03664edb43866d435553cd694c9456932
data/.gitignore CHANGED
@@ -7,3 +7,4 @@
7
7
  /pkg/
8
8
  /spec/reports/
9
9
  /tmp/
10
+ twitter.yml
@@ -2,9 +2,11 @@ require 'active_record'
2
2
  require 'activerecord-import'
3
3
  require 'sqlite3'
4
4
  require 'logger'
5
+ require 'fileutils'
5
6
  require_relative 'tweet'
6
7
  require_relative 'twitter_client'
7
8
  require_relative 'find_images'
9
+ require_relative 'article_text_cleaner'
8
10
 
9
11
  class Article < ActiveRecord::Base
10
12
  class << self
@@ -88,11 +90,12 @@ class Article < ActiveRecord::Base
88
90
  ####################
89
91
  # Instance methods #
90
92
  ####################
91
- def tweet(tweet_text)
92
- Tweet.new(tweet_text, filename: @image)
93
+ def tweet(tweet_text, opts = {})
94
+ @tweet_result = Tweet.new(tweet_text, opts).result
93
95
  self.tweeted = true
94
96
  save
95
- 'tweeted'
97
+ pp 'tweeted'
98
+ @tweet_result
96
99
  rescue StandardError => e
97
100
  self.failed_tweet_at = Time.now
98
101
  save
@@ -100,6 +103,7 @@ class Article < ActiveRecord::Base
100
103
  end
101
104
 
102
105
  def screenshot_path
106
+ FileUtils.mkdir_p('screenshots') unless File.directory?('screenshots')
103
107
  "screenshots/#{escaped_title}.png"
104
108
  end
105
109
 
@@ -134,10 +138,16 @@ class Article < ActiveRecord::Base
134
138
  "https://en.wikipedia.org/wiki/#{escaped_title}?veaction=edit&summary=%23#{bot_name}"
135
139
  end
136
140
 
141
+ def dirp
142
+ pp RASTERIZE_PATH
143
+ end
144
+
145
+ RASTERIZE_PATH = "#{__dir__}/rasterize.js".freeze
137
146
  def make_screenshot
138
- webshot = Webshot::Screenshot.instance
139
- webshot.capture mobile_url, "public/#{screenshot_path}",
140
- width: 800, height: 800, allowed_status_codes: [404]
147
+ # Use rasterize script to make a screenshot
148
+ %x[phantomjs #{RASTERIZE_PATH} #{mobile_url} #{screenshot_path} 1000px*1000px]
149
+ # Trim any extra blank space, which may or may not be present.
150
+ %x[convert #{screenshot_path} -trim #{screenshot_path}]
141
151
  end
142
152
 
143
153
  def hashtag
@@ -148,5 +158,26 @@ class Article < ActiveRecord::Base
148
158
  self.class.bot_name
149
159
  end
150
160
 
161
+ def wikilinks
162
+ return @links if @links.present?
163
+ query = { prop: 'links', titles: title, plnamespace: '0', pllimit: 500 }
164
+ response = Wiki.query query
165
+ @links = response.data['pages'].values.first['links'].map { |link| link['title'] }
166
+ @links
167
+ end
168
+
169
+ def page_text
170
+ @page_text ||= Wiki.get_page_content title
171
+ end
172
+
173
+ def plaintext
174
+ @plaintext = ArticleTextCleaner.convert(page_text)
175
+ end
176
+
177
+ def sentence_with(text)
178
+ # TODO: Remove the plaintext footnote remnants
179
+ plaintext[/[^.?!\n]*#{Regexp.quote text}[^.?!]*[.?!]/i]
180
+ end
181
+
151
182
  class NoImageError < StandardError; end
152
183
  end
@@ -0,0 +1,29 @@
1
+ require 'pandoc-ruby'
2
+
3
+ class ArticleTextCleaner
4
+ def self.convert(page_text)
5
+ new(page_text).convert
6
+ end
7
+
8
+ def initialize(page_text)
9
+ @page_text = page_text
10
+ end
11
+
12
+ def convert
13
+ @output = PandocRuby.new(@page_text, from: :mediawiki, to: :plain).convert
14
+ remove_refs
15
+ replace_single_linebreaks
16
+ @output
17
+ end
18
+
19
+ # Refs end up in plaintext as: [12]
20
+ def remove_refs
21
+ @output.gsub!(/\[\d+\]/, '')
22
+ end
23
+
24
+ # Linebreaks just for line wrapping appear where spaces should be.
25
+ # Double line breaks happen between paragraphs; leave those in place.
26
+ def replace_single_linebreaks
27
+ @output.gsub!(/(?<!\n)\n(?!\n)/, ' ')
28
+ end
29
+ end
@@ -30,10 +30,38 @@ class FindArticles
30
30
  by_ids(ids)
31
31
  end
32
32
 
33
+ def self.by_title(title)
34
+ existing = Article.find_by(title: title)
35
+ return existing if existing.present?
36
+ page_data = Wiki.query title_info_query(title)
37
+ article_data = page_data.data['pages'].values.first
38
+ article = Article.new(id: article_data['pageid'],
39
+ title: article_data['title'],
40
+ latest_revision: article_data['lastrevid'],
41
+ latest_revision_datetime: article_data['touched'])
42
+
43
+ return article unless article_data['redirect']
44
+
45
+ # If it's a redirect, return the redirect target instead.
46
+ redirect_target = article.wikilinks.first
47
+ return by_title(redirect_target)
48
+ end
49
+
33
50
  ####################
34
51
  # Internal methods #
35
52
  ####################
36
53
 
54
+ def self.title_revisions_query(title)
55
+ { prop: 'revisions',
56
+ titles: title,
57
+ rvprop: 'userid|ids|timestamp' }
58
+ end
59
+
60
+ def self.title_info_query(title)
61
+ { prop: 'info',
62
+ titles: title }
63
+ end
64
+
37
65
  def self.revisions_query(article_ids)
38
66
  { prop: 'revisions',
39
67
  pageids: article_ids,
@@ -1,6 +1,5 @@
1
1
  class FindImages
2
2
  def self.first(article)
3
- page_text = Wiki.get_page_content article.title
4
- page_text[/File:.{,60}\.jpg/]
3
+ article.page_text[/File:.{,60}\.jpg/]
5
4
  end
6
5
  end
@@ -0,0 +1,50 @@
1
+ // Adapted slightly from https://github.com/ariya/phantomjs/blob/master/examples/rasterize.js
2
+ // License: 3-clause BSD https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD
3
+ "use strict";
4
+ var page = require('webpage').create(),
5
+ system = require('system'),
6
+ address, output, size, pageWidth, pageHeight;
7
+
8
+ if (system.args.length < 3 || system.args.length > 5) {
9
+ console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
10
+ console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
11
+ console.log(' image (png/jpg output) examples: "1920px" entire page, window width 1920px');
12
+ console.log(' "800px*600px" window, clipped to 800x600');
13
+ phantom.exit(1);
14
+ } else {
15
+ address = system.args[1];
16
+ output = system.args[2];
17
+ page.viewportSize = { width: 600, height: 600 };
18
+ if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
19
+ size = system.args[3].split('*');
20
+ page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
21
+ : { format: system.args[3], orientation: 'portrait', margin: '1cm' };
22
+ } else if (system.args.length > 3 && system.args[3].substr(-2) === "px") {
23
+ size = system.args[3].split('*');
24
+ if (size.length === 2) {
25
+ var pageWidth = parseInt(size[0], 10),
26
+ pageHeight = parseInt(size[1], 10);
27
+ page.viewportSize = { width: pageWidth, height: pageHeight };
28
+ page.clipRect = { top: 0, left: 0, width: pageWidth + 20, height: pageHeight };
29
+ } else {
30
+ console.log("size:", system.args[3]);
31
+ var pageWidth = parseInt(system.args[3], 10),
32
+ pageHeight = parseInt(pageWidth * 3/4, 10); // it's as good an assumption as any
33
+ console.log ("pageHeight:",pageHeight);
34
+ }
35
+ }
36
+ if (system.args.length > 4) {
37
+ page.zoomFactor = system.args[4];
38
+ }
39
+ page.open(address, function (status) {
40
+ if (status !== 'success') {
41
+ console.log('Unable to load the address!');
42
+ phantom.exit(1);
43
+ } else {
44
+ window.setTimeout(function () {
45
+ page.render(output);
46
+ phantom.exit();
47
+ }, 200);
48
+ }
49
+ });
50
+ }
@@ -2,6 +2,7 @@ require 'twitter'
2
2
 
3
3
  # Finds tweetable articles, tweets them
4
4
  class Tweet
5
+ attr_reader :result
5
6
  # Find an article to tweet and tweet it
6
7
  def self.anything
7
8
  # Randomly tweet either the earlier tweetable Article in the database
@@ -20,13 +21,17 @@ class Tweet
20
21
  ###############
21
22
  # Twitter API #
22
23
  ###############
23
- def initialize(tweet, filename: nil)
24
- if filename
24
+ def initialize(tweet, opts = {})
25
+ if opts[:commons_image]
26
+ filename = opts.delete(:commons_image)
25
27
  Wiki.save_commons_image filename
26
- TwitterClient.new.client.update_with_media(tweet, File.new(filename))
28
+ @result = TwitterClient.new.client.update_with_media(tweet, File.new(filename), opts)
27
29
  File.delete filename
30
+ elsif opts[:filename]
31
+ filename = opts.delete(:filename)
32
+ @result = TwitterClient.new.client.update_with_media(tweet, File.new(filename), opts)
28
33
  else
29
- TwitterClient.new.client.update(tweet)
34
+ @result = TwitterClient.new.client.update(tweet, opts)
30
35
  end
31
36
  end
32
37
 
@@ -30,4 +30,8 @@ class TwitterClient
30
30
  def hashtags_in(text)
31
31
  text.scan(/\s(#\w+)/).flatten
32
32
  end
33
+
34
+ def trends
35
+ @client.trends.map(&:name)
36
+ end
33
37
  end
@@ -1,3 +1,3 @@
1
1
  module WikipediaTwitterbot
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '0.2.0'.freeze
3
3
  end
@@ -30,4 +30,5 @@ Gem::Specification.new do |spec|
30
30
  spec.add_runtime_dependency 'twitter'
31
31
  spec.add_runtime_dependency 'mediawiki_api'
32
32
  spec.add_runtime_dependency 'logger'
33
+ spec.add_runtime_dependency 'pandoc-ruby'
33
34
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikipedia_twitterbot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sage Ross
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-27 00:00:00.000000000 Z
11
+ date: 2018-01-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pandoc-ruby
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
125
139
  description:
126
140
  email:
127
141
  - sage@ragesoss.com
@@ -139,6 +153,7 @@ files:
139
153
  - bin/setup
140
154
  - lib/wikipedia_twitterbot.rb
141
155
  - lib/wikipedia_twitterbot/article.rb
156
+ - lib/wikipedia_twitterbot/article_text_cleaner.rb
142
157
  - lib/wikipedia_twitterbot/category_filter.rb
143
158
  - lib/wikipedia_twitterbot/db/001_create_articles.rb
144
159
  - lib/wikipedia_twitterbot/db/bootstrap.rb
@@ -147,6 +162,7 @@ files:
147
162
  - lib/wikipedia_twitterbot/find_images.rb
148
163
  - lib/wikipedia_twitterbot/high_pageviews.rb
149
164
  - lib/wikipedia_twitterbot/ores.rb
165
+ - lib/wikipedia_twitterbot/rasterize.js
150
166
  - lib/wikipedia_twitterbot/tweet.rb
151
167
  - lib/wikipedia_twitterbot/twitter_client.rb
152
168
  - lib/wikipedia_twitterbot/version.rb
@@ -173,7 +189,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
173
189
  version: '0'
174
190
  requirements: []
175
191
  rubyforge_project:
176
- rubygems_version: 2.6.8
192
+ rubygems_version: 2.7.3
177
193
  signing_key:
178
194
  specification_version: 4
179
195
  summary: Tools for building Wikipedia-focused Twitter bots