wikipedia_twitterbot 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 1330f2a58c111f7cee263a8746cd241424ce78cd
4
- data.tar.gz: 4ba11d3e6b833f66de96f57bb77150af90216f74
2
+ SHA256:
3
+ metadata.gz: 35215d526e53248f263a20c08f6b737fdb09ea6df5b814543f9d6a55f19d69cd
4
+ data.tar.gz: fc3db6117288c4cb6531dc988278bfa39c1990ca598b7d583391c6607138ef7d
5
5
  SHA512:
6
- metadata.gz: 87da8c83217ea2f27a2150cdbfd9ca4505daf0493e18d96a61fadcadc58b8bf038d954ba0c64018fba1c2a308a8afb2909ff5fd5c8abc6551f21549d21386881
7
- data.tar.gz: f82241415cfa16ec0f9c8c1670583e5ce8a2defbe85f00491f1f4b25945cf4192d0dc8a837f52f2474f841cae82fc451289e987e3eb3ac3698ed0ef2231e1503
6
+ metadata.gz: fce11b334834aa0d8d3f296edd55082d2f3c32bf70dbabc0271eeb09a339862cfec4c95adf382707d5fd790d7224d18b4fb84e3c79709ae57aa42e395d64c578
7
+ data.tar.gz: aa7dc3aac490003622aad8680847af3e0ffd9500bba5dd8bd20ab28890b09a64c7cf17a047e8bcc11699bd0fd8ae83d03664edb43866d435553cd694c9456932
data/.gitignore CHANGED
@@ -7,3 +7,4 @@
7
7
  /pkg/
8
8
  /spec/reports/
9
9
  /tmp/
10
+ twitter.yml
@@ -2,9 +2,11 @@ require 'active_record'
2
2
  require 'activerecord-import'
3
3
  require 'sqlite3'
4
4
  require 'logger'
5
+ require 'fileutils'
5
6
  require_relative 'tweet'
6
7
  require_relative 'twitter_client'
7
8
  require_relative 'find_images'
9
+ require_relative 'article_text_cleaner'
8
10
 
9
11
  class Article < ActiveRecord::Base
10
12
  class << self
@@ -88,11 +90,12 @@ class Article < ActiveRecord::Base
88
90
  ####################
89
91
  # Instance methods #
90
92
  ####################
91
- def tweet(tweet_text)
92
- Tweet.new(tweet_text, filename: @image)
93
+ def tweet(tweet_text, opts = {})
94
+ @tweet_result = Tweet.new(tweet_text, opts).result
93
95
  self.tweeted = true
94
96
  save
95
- 'tweeted'
97
+ pp 'tweeted'
98
+ @tweet_result
96
99
  rescue StandardError => e
97
100
  self.failed_tweet_at = Time.now
98
101
  save
@@ -100,6 +103,7 @@ class Article < ActiveRecord::Base
100
103
  end
101
104
 
102
105
  def screenshot_path
106
+ FileUtils.mkdir_p('screenshots') unless File.directory?('screenshots')
103
107
  "screenshots/#{escaped_title}.png"
104
108
  end
105
109
 
@@ -134,10 +138,16 @@ class Article < ActiveRecord::Base
134
138
  "https://en.wikipedia.org/wiki/#{escaped_title}?veaction=edit&summary=%23#{bot_name}"
135
139
  end
136
140
 
141
+ def dirp
142
+ pp RASTERIZE_PATH
143
+ end
144
+
145
+ RASTERIZE_PATH = "#{__dir__}/rasterize.js".freeze
137
146
  def make_screenshot
138
- webshot = Webshot::Screenshot.instance
139
- webshot.capture mobile_url, "public/#{screenshot_path}",
140
- width: 800, height: 800, allowed_status_codes: [404]
147
+ # Use rasterize script to make a screenshot
148
+ %x[phantomjs #{RASTERIZE_PATH} #{mobile_url} #{screenshot_path} 1000px*1000px]
149
+ # Trim any extra blank space, which may or may not be present.
150
+ %x[convert #{screenshot_path} -trim #{screenshot_path}]
141
151
  end
142
152
 
143
153
  def hashtag
@@ -148,5 +158,26 @@ class Article < ActiveRecord::Base
148
158
  self.class.bot_name
149
159
  end
150
160
 
161
+ def wikilinks
162
+ return @links if @links.present?
163
+ query = { prop: 'links', titles: title, plnamespace: '0', pllimit: 500 }
164
+ response = Wiki.query query
165
+ @links = response.data['pages'].values.first['links'].map { |link| link['title'] }
166
+ @links
167
+ end
168
+
169
+ def page_text
170
+ @page_text ||= Wiki.get_page_content title
171
+ end
172
+
173
+ def plaintext
174
+ @plaintext = ArticleTextCleaner.convert(page_text)
175
+ end
176
+
177
+ def sentence_with(text)
178
+ # TODO: Remove the plaintext footnote remnants
179
+ plaintext[/[^.?!\n]*#{Regexp.quote text}[^.?!]*[.?!]/i]
180
+ end
181
+
151
182
  class NoImageError < StandardError; end
152
183
  end
@@ -0,0 +1,29 @@
1
+ require 'pandoc-ruby'
2
+
3
+ class ArticleTextCleaner
4
+ def self.convert(page_text)
5
+ new(page_text).convert
6
+ end
7
+
8
+ def initialize(page_text)
9
+ @page_text = page_text
10
+ end
11
+
12
+ def convert
13
+ @output = PandocRuby.new(@page_text, from: :mediawiki, to: :plain).convert
14
+ remove_refs
15
+ replace_single_linebreaks
16
+ @output
17
+ end
18
+
19
+ # Refs end up in plaintext as: [12]
20
+ def remove_refs
21
+ @output.gsub!(/\[\d+\]/, '')
22
+ end
23
+
24
+ # Linebreaks just for line wrapping appear where spaces should be.
25
+ # Double line breaks happen between paragraphs; leave those in place.
26
+ def replace_single_linebreaks
27
+ @output.gsub!(/(?<!\n)\n(?!\n)/, ' ')
28
+ end
29
+ end
@@ -30,10 +30,38 @@ class FindArticles
30
30
  by_ids(ids)
31
31
  end
32
32
 
33
+ def self.by_title(title)
34
+ existing = Article.find_by(title: title)
35
+ return existing if existing.present?
36
+ page_data = Wiki.query title_info_query(title)
37
+ article_data = page_data.data['pages'].values.first
38
+ article = Article.new(id: article_data['pageid'],
39
+ title: article_data['title'],
40
+ latest_revision: article_data['lastrevid'],
41
+ latest_revision_datetime: article_data['touched'])
42
+
43
+ return article unless article_data['redirect']
44
+
45
+ # If it's a redirect, return the redirect target instead.
46
+ redirect_target = article.wikilinks.first
47
+ return by_title(redirect_target)
48
+ end
49
+
33
50
  ####################
34
51
  # Internal methods #
35
52
  ####################
36
53
 
54
+ def self.title_revisions_query(title)
55
+ { prop: 'revisions',
56
+ titles: title,
57
+ rvprop: 'userid|ids|timestamp' }
58
+ end
59
+
60
+ def self.title_info_query(title)
61
+ { prop: 'info',
62
+ titles: title }
63
+ end
64
+
37
65
  def self.revisions_query(article_ids)
38
66
  { prop: 'revisions',
39
67
  pageids: article_ids,
@@ -1,6 +1,5 @@
1
1
  class FindImages
2
2
  def self.first(article)
3
- page_text = Wiki.get_page_content article.title
4
- page_text[/File:.{,60}\.jpg/]
3
+ article.page_text[/File:.{,60}\.jpg/]
5
4
  end
6
5
  end
@@ -0,0 +1,50 @@
1
+ // Adapted slightly from https://github.com/ariya/phantomjs/blob/master/examples/rasterize.js
2
+ // License: 3-clause BSD https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD
3
+ "use strict";
4
+ var page = require('webpage').create(),
5
+ system = require('system'),
6
+ address, output, size, pageWidth, pageHeight;
7
+
8
+ if (system.args.length < 3 || system.args.length > 5) {
9
+ console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
10
+ console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
11
+ console.log(' image (png/jpg output) examples: "1920px" entire page, window width 1920px');
12
+ console.log(' "800px*600px" window, clipped to 800x600');
13
+ phantom.exit(1);
14
+ } else {
15
+ address = system.args[1];
16
+ output = system.args[2];
17
+ page.viewportSize = { width: 600, height: 600 };
18
+ if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
19
+ size = system.args[3].split('*');
20
+ page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
21
+ : { format: system.args[3], orientation: 'portrait', margin: '1cm' };
22
+ } else if (system.args.length > 3 && system.args[3].substr(-2) === "px") {
23
+ size = system.args[3].split('*');
24
+ if (size.length === 2) {
25
+ var pageWidth = parseInt(size[0], 10),
26
+ pageHeight = parseInt(size[1], 10);
27
+ page.viewportSize = { width: pageWidth, height: pageHeight };
28
+ page.clipRect = { top: 0, left: 0, width: pageWidth + 20, height: pageHeight };
29
+ } else {
30
+ console.log("size:", system.args[3]);
31
+ var pageWidth = parseInt(system.args[3], 10),
32
+ pageHeight = parseInt(pageWidth * 3/4, 10); // it's as good an assumption as any
33
+ console.log ("pageHeight:",pageHeight);
34
+ }
35
+ }
36
+ if (system.args.length > 4) {
37
+ page.zoomFactor = system.args[4];
38
+ }
39
+ page.open(address, function (status) {
40
+ if (status !== 'success') {
41
+ console.log('Unable to load the address!');
42
+ phantom.exit(1);
43
+ } else {
44
+ window.setTimeout(function () {
45
+ page.render(output);
46
+ phantom.exit();
47
+ }, 200);
48
+ }
49
+ });
50
+ }
@@ -2,6 +2,7 @@ require 'twitter'
2
2
 
3
3
  # Finds tweetable articles, tweets them
4
4
  class Tweet
5
+ attr_reader :result
5
6
  # Find an article to tweet and tweet it
6
7
  def self.anything
7
8
  # Randomly tweet either the earlier tweetable Article in the database
@@ -20,13 +21,17 @@ class Tweet
20
21
  ###############
21
22
  # Twitter API #
22
23
  ###############
23
- def initialize(tweet, filename: nil)
24
- if filename
24
+ def initialize(tweet, opts = {})
25
+ if opts[:commons_image]
26
+ filename = opts.delete(:commons_image)
25
27
  Wiki.save_commons_image filename
26
- TwitterClient.new.client.update_with_media(tweet, File.new(filename))
28
+ @result = TwitterClient.new.client.update_with_media(tweet, File.new(filename), opts)
27
29
  File.delete filename
30
+ elsif opts[:filename]
31
+ filename = opts.delete(:filename)
32
+ @result = TwitterClient.new.client.update_with_media(tweet, File.new(filename), opts)
28
33
  else
29
- TwitterClient.new.client.update(tweet)
34
+ @result = TwitterClient.new.client.update(tweet, opts)
30
35
  end
31
36
  end
32
37
 
@@ -30,4 +30,8 @@ class TwitterClient
30
30
  def hashtags_in(text)
31
31
  text.scan(/\s(#\w+)/).flatten
32
32
  end
33
+
34
+ def trends
35
+ @client.trends.map(&:name)
36
+ end
33
37
  end
@@ -1,3 +1,3 @@
1
1
  module WikipediaTwitterbot
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '0.2.0'.freeze
3
3
  end
@@ -30,4 +30,5 @@ Gem::Specification.new do |spec|
30
30
  spec.add_runtime_dependency 'twitter'
31
31
  spec.add_runtime_dependency 'mediawiki_api'
32
32
  spec.add_runtime_dependency 'logger'
33
+ spec.add_runtime_dependency 'pandoc-ruby'
33
34
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikipedia_twitterbot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sage Ross
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-27 00:00:00.000000000 Z
11
+ date: 2018-01-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pandoc-ruby
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
125
139
  description:
126
140
  email:
127
141
  - sage@ragesoss.com
@@ -139,6 +153,7 @@ files:
139
153
  - bin/setup
140
154
  - lib/wikipedia_twitterbot.rb
141
155
  - lib/wikipedia_twitterbot/article.rb
156
+ - lib/wikipedia_twitterbot/article_text_cleaner.rb
142
157
  - lib/wikipedia_twitterbot/category_filter.rb
143
158
  - lib/wikipedia_twitterbot/db/001_create_articles.rb
144
159
  - lib/wikipedia_twitterbot/db/bootstrap.rb
@@ -147,6 +162,7 @@ files:
147
162
  - lib/wikipedia_twitterbot/find_images.rb
148
163
  - lib/wikipedia_twitterbot/high_pageviews.rb
149
164
  - lib/wikipedia_twitterbot/ores.rb
165
+ - lib/wikipedia_twitterbot/rasterize.js
150
166
  - lib/wikipedia_twitterbot/tweet.rb
151
167
  - lib/wikipedia_twitterbot/twitter_client.rb
152
168
  - lib/wikipedia_twitterbot/version.rb
@@ -173,7 +189,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
173
189
  version: '0'
174
190
  requirements: []
175
191
  rubyforge_project:
176
- rubygems_version: 2.6.8
192
+ rubygems_version: 2.7.3
177
193
  signing_key:
178
194
  specification_version: 4
179
195
  summary: Tools for building Wikipedia-focused Twitter bots