webwatchr 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,575 @@
1
+ require "digest/md5"
2
+ require "fileutils"
3
+ require "json"
4
+ require "logger"
5
+ require "net/http"
6
+ require "nokogiri"
7
+ require_relative "./logger"
8
+
9
+ class Site
10
+ include Loggable
11
+ class ParseError < StandardError
12
+ end
13
+
14
+ class RedirectError < StandardError
15
+ end
16
+
17
+ HTML_HEADER = "<!DOCTYPE html>\n<meta charset=\"utf-8\">\n".freeze
18
+ DEFAULT_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'.freeze
19
+
20
+ attr_accessor :url, :alerters, :rand_sleep, :every, :lastdir, :cache_dir, :state_file, :comment
21
+
22
+ attr_writer :name
23
+
24
# Stores +value+ in the instance variable named after +name+.
#
# @param name [String, Symbol] attribute name, without the leading "@"
# @param value [Object] value to store
# @return [self] the receiver, so calls can be chained
def set(name, value)
  ivar = "@#{name}"
  instance_variable_set(ivar, value)
  self
end
28
+
29
# Display name of the site: a copy of its URL, so callers can modify the
# returned value without touching @url.
#
# @return [String, nil] duplicate of @url
def name
  source = @url
  source.dup
end
32
+
33
# Builds a new instance and, when a block is given, evaluates that block in
# the context of the instance (DSL-style configuration).
#
# The new instance is always returned. Previously the value of the block's
# last expression was returned instead, so a block that did not end with a
# chainable setter handed back a String, nil, ... rather than the site.
#
# @yield configuration block, evaluated with +self+ set to the new instance
# @return [Site] the freshly created (and possibly configured) instance
def self.create(&block)
  site = new
  site.instance_eval(&block) if block
  site
end
40
+
41
# Turns any unknown method into an attribute accessor backed by the
# instance variable of the same name: without arguments it reads the
# variable, with arguments it writes it and returns the receiver.
def method_missing(attr, *args) # rubocop:disable Style/MissingRespondToMissing
  ivar = "@#{attr}"
  return instance_variable_get(ivar) if args.empty?

  instance_variable_set(ivar, *args)
  self
end
49
+
50
# Sets the defaults every site starts with.
def initialize
  # HTTP request settings
  @useragent = Site::DEFAULT_USER_AGENT
  @extra_headers = {}

  # Alerting
  @alerters = []
  @alert_only = []

  # Fetch behaviour: protocol version, optional sleep before fetching
  @http_ver = 1
  @rand_sleep = 0

  # Bookkeeping: nothing pulled yet, default period of one hour (seconds)
  @did_stuff = false
  @every = 3600
end
60
+
61
# Registers an additional HTTP header to send with every request.
#
# @param key [String] header name
# @param value [String] header value
# @return [Object] the stored value
def set_http_header(key, value)
  @extra_headers.store(key, value)
end
64
+
65
# Fetches +url+ with the implementation matching @http_ver: #fetch_url2
# when it is 2, #fetch_url1 (which receives +max_redir+) otherwise.
#
# @param url [String] URL to fetch
# @param max_redir [Integer] redirect budget forwarded to #fetch_url1
# @return [String] whatever the selected implementation returns
def fetch_url(url, max_redir: 10)
  @http_ver == 2 ? fetch_url2(url) : fetch_url1(url, max_redir: max_redir)
end
72
+
73
+ # Helper methods for generating HTML emails
74
+
75
# URL to mention in alert emails.
#
# @return [String, nil] the site's @url
def get_email_url
  @url
end
78
+
79
# Subject line for alert emails: the class name, followed by the comment
# in parentheses when one is set.
#
# @return [String]
def get_email_subject
  suffix = @comment ? " (#{@comment})" : ""
  "Update from #{self.class}#{suffix}"
end
86
+
87
# HTML body for alert emails: the shared HTML header followed by @content.
#
# @return [String, nil] nil when there is no content
def generate_html_content
  return nil unless @content

  Site::HTML_HEADER + @content
end
94
+
95
+ # Helper methods to generate Telegram content
96
# Message pieces for Telegram alerts: a single piece holding @content.
#
# @return [Array] one-element array
def generate_telegram_message_pieces
  [@content]
end
99
+
100
# Fetches +url+ over HTTP/2 through the curb gem and returns the body.
#
# A POST carrying @post_data is issued when it is set, a plain request
# otherwise. @useragent and @extra_headers are sent as request headers.
# libcurl's verbose trace is now only enabled when Ruby runs with $VERBOSE,
# matching the debug-output switch used by #fetch_url1; it used to be
# switched on unconditionally.
#
# NOTE(review): Curl::Easy.http_post presumably performs the request on
# its own before #perform is called below - confirm against curb's docs.
#
# @param url [String] absolute URL to fetch
# @return [String] response body as reported by curb
def fetch_url2(url)
  require "curb" # loaded lazily: only needed by sites that opt into HTTP/2

  if @post_data
    cmethod = Curl::Easy.method(:http_post)
    params = [url, @post_data]
  else
    cmethod = Curl::Easy.method(:new)
    params = [url]
  end

  c = cmethod.call(*params) do |curl|
    curl.set(:HTTP_VERSION, Curl::HTTP_2_0)
    curl.headers['User-Agent'] = @useragent if @useragent
    curl.verbose = true if $VERBOSE
    @extra_headers.each { |k, v| curl.headers[k] = v }
  end

  c.perform
  c.body_str
end
125
+
126
# Fetches +url+ over HTTP/1.x with Net::HTTP and returns the response body.
#
# The request is a form POST when @post_data is set, a JSON POST when
# @post_json is set, and a GET otherwise. @useragent and @extra_headers are
# added to the request. 301/302 responses and zero-delay
# <meta http-equiv="refresh"> pages are followed through #fetch_url.
#
# Fixes over the previous version:
# * the meta-refresh branch logged an unset variable instead of the target;
# * a response without body (e.g. 204) crashed on nil; it now yields "";
# * a Content-Encoding value that is not a Ruby encoding name (gzip, br,
#   ...) crashed force_encoding; it now falls back to UTF-8 transcoding;
# * a redirect without Location header raised NoMethodError; it now raises
#   Site::RedirectError.
#
# @param url [String] absolute URL to fetch
# @param max_redir [Integer] number of redirections still allowed
# @return [String] the response body
# @raise [Site::RedirectError] when the redirect budget is exhausted or a
#   redirect response carries no Location header
def fetch_url1(url, max_redir: 10)
  html = ""
  uri = URI(url)
  origin = "#{uri.scheme}://#{uri.hostname}:#{uri.port}"
  http_o = Net::HTTP.new(uri.host, uri.port)
  http_o.use_ssl = (uri.scheme == 'https')
  http_o.set_debug_output $stderr if $VERBOSE
  http_o.start do |http|
    if @post_data
      req = Net::HTTP::Post.new(uri)
      req.set_form_data(@post_data)
    elsif @post_json
      req = Net::HTTP::Post.new(uri, 'Content-Type' => 'application/json')
      req.body = @post_json.instance_of?(String) ? @post_json : @post_json.to_json
    else
      req = Net::HTTP::Get.new(uri)
    end
    req["User-Agent"] = @useragent if @useragent
    @extra_headers.each { |k, v| req[k] = v }

    response = http.request(req)
    if %w[301 302].include?(response.code)
      raise Site::RedirectError if max_redir == 0

      location = response["Location"]
      raise Site::RedirectError, "Redirect from #{url} without a Location header" unless location

      # Relative targets are resolved against the origin of the current URL.
      unless location.start_with?("http")
        location = location.start_with?("/") ? "#{origin}#{location}" : "#{origin}/#{location}"
      end

      logger.debug "Redirecting to #{location}"
      return fetch_url(location, max_redir: max_redir - 1)
    end

    # Responses without a body (204, 304, ...) have a nil body.
    html = response.body || String.new

    if html =~ /meta http-equiv="refresh" content="0;URL='(.*)'/
      raise Site::RedirectError if max_redir == 0

      target = "#{origin}#{::Regexp.last_match(1)}"
      logger.debug "Redirecting to #{target}"
      return fetch_url(target, max_redir: max_redir - 1)
    end

    # Some servers put a charset in Content-Encoding; honour it only when
    # Ruby knows an encoding by that name.
    declared = response["Content-Encoding"]
    encoding = begin
      declared && Encoding.find(declared)
    rescue ArgumentError
      nil
    end
    html = if encoding
             html.force_encoding(encoding)
           else
             html.encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
           end
  end
  logger.debug "Fetched #{url}"
  html
end
195
+
196
# Parses fetched HTML; delegates to #parse_noko.
#
# @param html [String] raw page content
# @return [Object] the result of #parse_noko
def parse_content(html)
  parse_noko(html)
end
199
+
200
# Parses +html+ with Nokogiri. A first pass looks for <meta charset=...>
# tags; each one found re-tags the string with that encoding before the
# document is parsed again and returned.
#
# @param html [String] raw page content (its encoding tag may be changed)
# @return [Object] the document built by Nokogiri::HTML
def parse_noko(html)
  Nokogiri::HTML(html).css("meta").each do |meta_tag|
    charset = meta_tag['charset']
    html = html.force_encoding(charset) if charset
  end
  Nokogiri::HTML(html)
end
210
+
211
# Reads the saved state from @state_file.
#
# @return [Hash] the parsed JSON (with JSON additions enabled), or an empty
#   hash when no state file is configured, it does not exist, or it does
#   not contain valid JSON
def load_state_file
  return {} unless @state_file && File.exist?(@state_file)

  JSON.parse(File.read(@state_file), create_additions: true)
rescue JSON::ParserError
  {}
end
220
+
221
# Writes +hash+ to @state_file as pretty-printed JSON, replacing any
# previous content.
#
# @param hash [Hash] state to persist
# @return [Integer] number of bytes written
def save_state_file(hash)
  File.write(@state_file, JSON.pretty_generate(hash))
end
226
+
227
# Merges fresh bookkeeping (current time, URL, wait period) and then
# +hash+ into the stored state, and saves the result. Keys in +hash+ win
# over the bookkeeping values.
#
# @param hash [Hash] entries to add or override in the state
def update_state_file(hash)
  state = load_state_file
  state.update("time" => Time.now.to_i, "url" => @url, "wait" => @wait)
  state.update(hash)
  save_state_file(state)
end
237
+
238
# Sends the current update through the configured alerters.
#
# When @alert_only is empty every alerter is used; otherwise only the
# alerters whose class IDENTIFIER is listed in @alert_only are used. The
# filter used to be inverted: listed alerters were the ones being skipped.
def alert
  logger.debug "Alerting new stuff"
  @alerters.each do |alerter|
    next unless @alert_only.empty? || @alert_only.include?(alerter.class::IDENTIFIER)

    alerter.alert(self)
  end
end
244
+
245
# Content gathered by the last run.
#
# @return [Object] @content
# @raise [StandardError] when no run has happened yet (@did_stuff unset)
def content
  return @content if @did_stuff

  raise StandardError, 'Trying to access @content, but we have not pulled any data yet'
end
252
+
253
# Extracts the content to watch; this base version returns the raw
# fetched page.
#
# @return [String, nil] @html_content
def get_content
  @html_content
end
256
+
257
# Stores the alerter identifiers kept in @alert_only.
#
# The Array branch used to call the non-existent +instance_of+ and
# therefore raised NoMethodError for every Array argument.
#
# @param alerter_identifiers [Symbol, Array<Symbol>] one identifier or a
#   list of identifiers
# @return [Array<Symbol>] the stored list
# @raise [StandardError] when the argument is neither a Symbol nor an Array
def alert_only(alerter_identifiers)
  @alert_only =
    case alerter_identifiers
    when Symbol then [alerter_identifiers]
    when Array then alerter_identifiers
    else
      raise StandardError, "unknown type of provided alerter identifier #{alerter_identifiers}"
    end
end
266
+
267
# Tells whether the wait period has elapsed since the last update.
#
# @param prevous_time [Integer] epoch seconds of the previous update
# @return [Boolean] true once now >= previous time + @wait
def should_update?(prevous_time)
  next_run = prevous_time + @wait
  Time.now.to_i >= next_run
end
270
+
271
# Stores the result of #get_content in @content and returns it. This base
# version ignores the previous content.
#
# @param _previous_content [Object] unused
# @return [Object] the freshly extracted content
def get_new(_previous_content = nil)
  @content = get_content
end
275
+
276
# Runs one update cycle for the site.
#
# Derives @cache_dir and @state_file from the URL's host name and MD5,
# picks the wait period (@every, else the stored "wait", else one hour)
# and calls #do_stuff. Redirect, parse and network errors are logged,
# printed with +warn+, and recorded in the state file with a wait period
# extended by 30 minutes.
#
# @param cache_dir [String] directory under which the cache path is built
# @param last_dir [String] directory under which the state file is built
# @param test [Boolean] stored in @test
# @raise [StandardError] when no URL is set
def update(cache_dir:, last_dir:, test: false)
  raise StandardError, "Didn't set URL for site #{self}" unless @url

  digest = Digest::MD5.hexdigest(@url)
  @cache_dir = File.join(cache_dir, "cache-#{URI.parse(@url).hostname}-#{digest}")
  @state_file = File.join(last_dir, "last-#{URI.parse(@url).hostname}-#{digest}")
  stored = load_state_file
  @wait = @every || stored["wait"] || 60 * 60
  @test = test
  logger.debug "using #{@state_file} to store updates, and #{@cache_dir} for Cache"

  do_stuff
rescue Site::RedirectError
  msg = "Error parsing page #{@url}, too many redirects. Will retry in #{@wait} + 30 minutes"
  logger.error msg
  warn msg
  update_state_file({ "wait" => @wait + 30 * 60 })
rescue Site::ParseError => e
  detail = e.message ? " with error : #{e.message}" : ""
  msg = "Error parsing page #{@url}#{detail}. Will retry in #{@wait} + 30 minutes"
  logger.error msg
  warn msg
  update_state_file({ "wait" => @wait + 30 * 60 })
rescue Errno::ECONNREFUSED, Net::ReadTimeout, OpenSSL::SSL::SSLError, Net::OpenTimeout => e
  detail = e.message ? " : #{e.message}" : ""
  msg = "Network error on #{@url}#{detail}. Will retry in #{@wait} + 30 minutes"
  logger.error msg
  warn msg
  update_state_file({ "wait" => @wait + 30 * 60 })
end
313
+
314
# Downloads the page at @url and parses it, keeping both the raw page
# (@html_content) and the parsed form (@parsed_content).
def pull_things
  raw_page = fetch_url(@url)
  @html_content = raw_page
  @parsed_content = parse_content(raw_page)
end
318
+
319
# Core of an update cycle.
#
# Loads the previous state and, when the wait period has elapsed (or in
# test mode), optionally sleeps, pulls the page and asks #get_new for new
# content. New content triggers #alert and is saved together with the
# previous content; in test mode it is only logged. Outside test mode the
# state file is refreshed at the end of every pull.
def do_stuff
  previous_state = { "time" => -9_999_999_999_999, "content" => nil }
  stored_state = load_state_file
  previous_state.update(stored_state) if stored_state
  previous_content = previous_state["content"]

  if should_update?(previous_state["time"]) || @test
    if @rand_sleep > 0 && !@test
      logger.info "Time to update #{@url} (sleeping #{@rand_sleep} sec)"
      sleep(@rand_sleep)
    else
      logger.info "Time to update #{@url}"
    end

    pull_things
    new_stuff = get_new(previous_content)
    @did_stuff = true

    if !new_stuff
      logger.info "Nothing new for #{@url}"
      logger.info "Current state is still :\n#{@content}" if @test
    elsif @test
      logger.info "Would have alerted with new stuff:\n#{new_stuff}"
    else
      alert
      update_state_file({ "content" => new_stuff, "previous_content" => previous_content })
    end

    update_state_file({}) unless @test
  else
    @did_stuff = true
    logger.info "Too soon to update #{@url}"
  end
end
362
+
363
+ class SimpleString < Site
364
# Value object wrapping the message produced by a SimpleString site. It
# renders identically for Telegram, plain text and HTML, and round-trips
# through JSON (see #to_json / .json_create).
class ResultObject
  attr_accessor :message

  # Rebuilds an instance from the hash produced by parsing #to_json output.
  def self.json_create(object)
    new(*object['message'])
  end

  # @param message [String] the text carried by this result
  def initialize(message = '')
    @message = message
  end

  # Telegram rendering of the message.
  def to_telegram
    @message
  end

  # Plain-text rendering of the message.
  def to_s
    @message
  end

  # HTML rendering of the message.
  def to_html
    @message
  end

  # Serializes the class name (under JSON.create_id) and the message.
  def to_json(*args)
    payload = { JSON.create_id => self.class.name, 'message' => @message }
    payload.to_json(*args)
  end

  # Two results are equal when they share class and message.
  def ==(other)
    return false unless self.class == other.class

    @message == other.message
  end
end
399
+
400
# Returns the current content when it differs from +previous_content+.
#
# When @content is already set it must be a ResultObject (or an instance
# of a subclass); otherwise it is obtained from #get_content. The type
# check used to be `@content.class < ResultObject`, which is false for
# ResultObject itself and so rejected plain ResultObject instances.
#
# @param previous_content [Object, nil] content stored by the previous run
# @return [Object, nil] the content, or nil when it equals the previous one
# @raise [StandardError] when a pre-set @content is not a ResultObject
def get_new(previous_content = nil)
  if @content
    unless @content.is_a?(ResultObject)
      raise StandardError, "The result of get_content() should be a ResultObject if the Site class is SimpleString"
    end
  else
    @content = get_content
  end
  return nil if @content == previous_content

  @content
end
411
+
412
# HTML body for alert emails: the shared header followed by the content,
# rendered through #to_html when it is a ResultObject.
#
# @return [String, nil] nil when there is no content
def generate_html_content
  return nil unless @content

  body = @content.is_a?(ResultObject) ? @content.to_html : @content
  Site::HTML_HEADER + body
end
423
+
424
# Message pieces for Telegram alerts: a single piece, rendered through
# #to_telegram when the content is a ResultObject.
#
# @return [Array] one-element array
def generate_telegram_message_pieces
  piece = if @content.is_a?(ResultObject)
            @content.to_telegram
          else
            @content
          end
  [piece]
end
427
+ end
428
+
429
# Site whose alerts carry a diff between the previous and the current
# content. Diffs are produced with the diffy gem when it can be loaded and
# with Test::Unit::Diff otherwise.
class DiffString < SimpleString
  begin
    require "diffy"

    # HTML email body: Diffy's stylesheet followed by the HTML diff.
    def generate_html_content
      page = Site::HTML_HEADER.dup
      page << "<head><style>"
      page << Diffy::CSS
      page << "</style><body>"
      page << @diffed.to_s(:html)
      page << "</body></html>"
    end

    # Builds the Diffy diff between +previous+ and +new+.
    def get_differ(previous, new)
      Diffy::Diff.new(previous, new)
    end
  rescue LoadError
    require "test/unit/diff"

    # HTML email body: the textual diff after the shared header.
    def generate_html_content
      page = Site::HTML_HEADER.dup
      page << @diffed.to_s
      page << "</body></html>"
    end

    # Unified diff between +previous+ and +new+; without a previous version
    # the new content is returned unchanged.
    def get_differ(previous, new)
      previous ? Test::Unit::Diff.unified(previous, new) : new
    end
  end

  # Extracts the content and, when it differs from +previous_content+,
  # stores the diff in @diffed.
  #
  # @return [String, nil] the diff as text, or nil when there is no content
  #   or nothing changed
  def get_new(previous_content = nil)
    @content = get_content
    return nil unless @content
    return nil unless @content != previous_content

    @diffed = get_differ(previous_content, @content)
    @diffed.to_s
  end
end
476
+
477
+ class Articles < Site
478
# Starts with the parent's defaults plus an empty list of articles.
def initialize
  super()
  @content = []
end
482
+
483
# Checks that +item+ carries a String under the "id" key.
#
# @param item [Hash] article to check
# @raise [StandardError] when "id" is missing or is not a String
def validate(item)
  id = item["id"]
  raise StandardError, "Needs at least \"id\" key" unless id
  raise StandardError, "\"id\" key needs to be a String and not #{id.class}" unless id.is_a?(String)
end
489
+
490
# Validates +item+, stamps it with the current time and appends it to
# @content unless an article with the same id is already there.
#
# @param item [Hash] article; must satisfy #validate
# @return [Array, nil] @content when the item was added, nil otherwise
def add_article(item)
  logger.debug "Found article #{item['id']}"
  validate(item)
  item["_timestamp"] = Time.now.to_i
  already_known = @content.any? { |existing| existing['id'] == item['id'] }
  @content << item unless already_known
end
496
+
497
# Runs #get_content and narrows @content down to the articles whose id is
# not present in +previous_content+. @content is filtered in place, so the
# returned list is @content itself.
#
# @param previous_content [Array<Hash>, nil] articles stored by earlier runs
# @return [Array<Hash>, nil] the new articles, or nil when there are none
def get_new(previous_content)
  get_content
  return nil unless @content

  if previous_content
    known_ids = previous_content.map { |entry| entry["id"] }
    @content.delete_if { |item| known_ids.include?(item["id"]) }
  end

  @content.empty? ? nil : @content
end
516
+
517
# Same bookkeeping as the parent version, except that the "content" entry
# of +hash+ is appended to the stored article list instead of replacing
# it. The "content" key is removed from +hash+.
#
# @param hash [Hash] entries to merge into the state
def update_state_file(hash)
  new_articles = hash.delete("content")

  state = load_state_file
  state.update("time" => Time.now.to_i, "url" => @url, "wait" => @wait)
  state.update(hash)
  (state["content"] ||= []).concat(new_articles) if new_articles

  save_state_file(state)
end
532
+
533
# HTML body for alert emails: an unstyled list with one entry per article,
# showing the optional image and title, linked to the article URL when
# there is one.
#
# NOTE(review): article fields are inserted into the markup as-is (no
# HTML escaping) - confirm this is acceptable for scraped content.
#
# @return [String]
def generate_html_content
  entries = @content.map do |item|
    entry = "<li id='#{item['id']}'>"
    entry << "<a href='#{item['url']}'>" if item["url"]
    entry << "<img style='width:100px' src='#{item['img_src']}'/>" if item["img_src"]
    entry << item['title'].to_s if item["title"]
    entry << "</a>" if item["url"]
    entry << "</li>\n"
  end

  "#{Site::HTML_HEADER}<ul style='list-style-type: none;'>\n#{entries.join}</ul>"
end
556
+
557
# Message pieces for Telegram alerts, one per article: "title: url" when
# both are present, otherwise whichever of the two exists (nil when the
# article has neither).
#
# The URL used to be appended a second time after the title/URL handling,
# producing "title: url: url" (or "url: url" for untitled articles).
#
# @return [Array<String, nil>]
def generate_telegram_message_pieces
  @content.map do |item|
    title = item["title"]
    url = item["url"]
    title && url ? "#{title}: #{url}" : (title || url)
  end
end
574
+ end
575
+ end
data/lib/webwatchr.rb ADDED
@@ -0,0 +1 @@
1
+ require "webwatchr/main"
data/tests/helpers.rb ADDED
@@ -0,0 +1,32 @@
1
+ require "fileutils"
2
+ require "tmpdir"
3
+ require "test/unit"
4
+
5
+ require_relative "../lib/webwatchr/alerting"
6
+
7
+ class ArticleSiteTest < Test::Unit::TestCase
8
# Runs +site.update+ against throw-away cache and state directories.
#
# The block form of Dir.mktmpdir removes the temporary tree even when
# +site.update+ raises; the explicit rm_rf used before was skipped in
# that case and left the directory behind.
#
# @param site [#update] site under test; receives +cache_dir:+ and
#   +last_dir:+ keyword arguments
def fakeupdate(site)
  Dir.mktmpdir('fakesite') do |workdir|
    cache_dir = File.join(workdir, 'cache')
    last_dir = File.join(workdir, 'last')

    FileUtils.mkdir_p(cache_dir)
    FileUtils.mkdir_p(last_dir)
    site.update(cache_dir: cache_dir, last_dir: last_dir)
  end
end
18
+ end
19
+
20
# Alerter used by the test suite: instead of sending anything, it keeps
# the content of the site it was asked to alert about in #result.
class TestAlerter < Webwatchr::Alerting::Base
  IDENTIFIER = :testtest

  attr_accessor :result

  def initialize
    super()
    self.result = nil
  end

  # Records +site.content+ as the result.
  def alert(site)
    self.result = site.content
  end
end