webwatchr 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1675ffe16b57de72800f93dfa9a81a83475e5f815e3a40a3df5175f7d5f0ce69
4
- data.tar.gz: 1936758eb8f893715b7f91337358a0f34b3ea50ff5dca4f8d39eeb2b7119d46f
3
+ metadata.gz: a2ceb7b164a65a05718c637bf20288b99bd6dc71f8e048508e1e5e45db03b351
4
+ data.tar.gz: 7f7356e8e9806d1cca36cae95633a0de35c5ac0881061f3dd44de415144ee3c2
5
5
  SHA512:
6
- metadata.gz: 48680b66395e44a11241bf873018aa447f5e2ecabca4862dfc870209dfcb04bc5c35023af7f2dde6f0129344cfc6186b12abae0b3ae23089b33ef74ee00e19a6
7
- data.tar.gz: 6afdf42375ca35935340649188dc2bccbeefade1ddec65ea2c56d0f601db7d24ddf48e49251fa80a0b7739fee35ad1ee914c62883ef4b40805c68930b3368e42
6
+ metadata.gz: 5a905d1ec16cedc8e60aba00f1c1a898fd9282a1d5a159a44467feb11d22fa6f75c92df016d3cb01a40679ea8a91dbf5a459e9ed6592f4417ddb2ad0b6c3c113
7
+ data.tar.gz: 6b47cc97bc17c24b1589b1fc0ecb1ea51b6eef14e2c01a77cecc2583df6d21f6ed36cf98d98fc237cccd7909a27e56b3b5a58dc067b4c2667afe5fd912cdec8a
data/README.md CHANGED
@@ -6,7 +6,8 @@ Silly script to periodically check webpage changes.
6
6
  2. pulls data for every Website to check, if the last time we did that is long ago
7
7
  4. if content is different from the last time, alerts you with the new content (email, telegram)
8
8
 
9
- # Installation
9
+
10
+ ## Installation
10
11
 
11
12
  ```shell
12
13
 
@@ -29,7 +30,7 @@ class SomeSimpleSite < Site::SimpleString
29
30
  # Implement this function, to return what you want to compare every run
30
31
  def get_content
31
32
  res = ""
32
- @parsed_content.css("div.shop-main a").map do |a|
33
+ @parsed_html.css("div.shop-main a").map do |a|
33
34
  url = "https://somesimplesite.com/shop/#{a['href']}"
34
35
  if a.css('img')[0]['src'] == "soldout.png"
35
36
  next
@@ -81,7 +82,7 @@ Run the cron often:
81
82
  */5 * * * * cd /home/poil/my_fav_scripts/; ruby dsl.rb
82
83
  ```
83
84
 
84
- # Supported websites
85
+ ## Supported websites
85
86
 
86
87
  List of sites that are somewhat maintained are [listed here](https://github.com/conchyliculture/webwatchr).
87
88
 
@@ -92,6 +93,18 @@ Some examples:
92
93
  * Package tracking (DHL, Colissimo, i-parcel, Royalmail, PostNL, UPS, USPS, etc.)
93
94
 
94
95
 
96
+ ## Command line options
97
+
98
+ From `--help`:
99
+
100
+ ```
101
+ Usage: ruby /home/renzokuken/scripts/webwatchr/lib/webwatchr/main.rb
102
+ -s, --site=SITE Run Webwatchr on one site only. It has to be the name of the class for that site.
103
+ -v, --verbose Be verbose (output to STDOUT instead of logfile
104
+ -t, --test Check website (ignoring wait time) and show what we've parsed
105
+ -h, --help Prints this help
106
+ ```
107
+
95
108
  ## Force a site check, ignoring the 'wait' parameter
96
109
 
97
110
  This can be useful to run a site update at a specific time/day with a crontab, instead of every specified amount of time. You can force update a website using the -s flag:
@@ -99,24 +112,44 @@ This can be useful to run a site update at a specific time/day with a crontab, i
99
112
  ruby webwatchr.rb -t -s SiteClass
100
113
  ```
101
114
 
102
- # FAQ
103
- ## Tests?
115
+ ## FAQ
116
+ ### Tests?
104
117
 
105
- There are like like, two!
118
+ There are, like, two!
106
119
 
107
120
  Run `rake`
108
121
 
109
- ## Logs ?
122
+ ### Logs ?
110
123
 
111
124
  Call `logger`, as you would a classic `Logger` object in your `mysite.rb`.
112
125
 
113
- ## Alerting
126
+ ### Alerting
114
127
 
115
128
  Email is the main method of alerting, but you can also set webwatchr to talk to you on Telegram through a bot.
116
129
 
117
- ### Email
130
+ #### Email
131
+
132
+ In your Main block, add
133
+
134
+ ```ruby
135
+ add_default_alert :email do
136
+ set :smtp_port, 25
137
+ set :smtp_server, "localhost"
138
+ set :from_addr, "webwatchr@domain.eu"
139
+ set :dest_addr, "admin@domain.eu"
140
+ end
141
+ ```
142
+
143
+ #### Telegram
118
144
 
119
145
  First make a bot and grab a token following the [Telegram procedure](https://core.telegram.org/bots#6-botfather).
120
146
 
121
147
  You also need to know the `chat_id` for its discussion with you. The code in [there](https://github.com/atipugin/telegram-bot-ruby/blob/master/examples/bot.rb) can help you.
122
148
 
149
+ then in your Main block, add
150
+
151
+ ```ruby
152
+ add_default_alert :telegram do
153
+ set :token, "12345:LONGTOKEN09876543"
154
+ set :chat_id, 1234567890
155
+ end
@@ -10,12 +10,12 @@ class BandcampMerch < Site::Articles
10
10
  self
11
11
  end
12
12
 
13
- def get_content()
14
- if @html_content =~ /You are being redirected, please follow <a href="([^"]+)"/
13
+ def extract_articles()
14
+ if @website_html =~ /You are being redirected, please follow <a href="([^"]+)"/
15
15
  new_url = ::Regexp.last_match(1)
16
- @html_content = Net::HTTP.get(URI.parse(new_url))
17
- @parsed_content = Nokogiri::HTML.parse(@html_content)
18
- item = @parsed_content.css('div#merch-item')
16
+ @website_html = Net::HTTP.get(URI.parse(new_url))
17
+ @parsed_html = Nokogiri::HTML.parse(@website_html)
18
+ item = @parsed_html.css('div#merch-item')
19
19
  if item.css(".notable").text == "Sold Out"
20
20
  logger.debug "That item is sold out =("
21
21
  return
@@ -30,11 +30,7 @@ class BandcampMerch < Site::Articles
30
30
  "title" => title
31
31
  })
32
32
  else
33
- f = File.new("/tmp/b", "w")
34
- f.write(@html_content)
35
- f.close
36
-
37
- @parsed_content.css('ol.merch-grid li').each do |xx|
33
+ @parsed_html.css('ol.merch-grid li').each do |xx|
38
34
  unless xx.css('p.sold-out').empty?
39
35
  logger.debug "That item is sold out =("
40
36
  next
data/lib/sites/bsky.rb CHANGED
@@ -107,14 +107,11 @@ class BskyAccount < BskyBase
107
107
  did = _profile_to_did(@account)
108
108
  path = "/xrpc/app.bsky.feed.getAuthorFeed?actor=#{did}&filter=posts_and_author_threads&limit=30"
109
109
  resp = _api_get(path)
110
- @parsed_content = JSON.parse(resp.body)
111
- f = File.open("/tmp/qsd", 'w')
112
- f.write(resp.body)
113
- f.close
110
+ @parsed_json = JSON.parse(resp.body)
114
111
  end
115
112
 
116
- def get_content
117
- @parsed_content['feed'].each do |p|
113
+ def extract_articles
114
+ @parsed_json['feed'].each do |p|
118
115
  post = p['post']
119
116
  text = post['record']['text']
120
117
  next if @regex and (text !~ @regex)
@@ -165,11 +162,11 @@ class BskySearch < BskyBase
165
162
 
166
163
  params = { "q" => "#danemark", "limit" => 30, "sort" => "top" }
167
164
  resp = _api_get("/xrpc/app.bsky.feed.searchPosts", params: params.to_a, headers: headers)
168
- @parsed_content = JSON.parse(resp.body)
165
+ @parsed_json = JSON.parse(resp.body)
169
166
  end
170
167
 
171
- def get_content
172
- @parsed_content['posts'].each do |post|
168
+ def extract_articles
169
+ @parsed_json['posts'].each do |post|
173
170
  add_article(_article_from_post(post))
174
171
  end
175
172
  end
data/lib/sites/postch.rb CHANGED
@@ -16,6 +16,7 @@ class PostCH < Site::SimpleString
16
16
  @mechanize = Mechanize.new()
17
17
  @mechanize.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:132.0) Gecko/20100101 Firefox/132.0'
18
18
  @text_messages = {}
19
+ @parsed_json = nil
19
20
  end
20
21
 
21
22
  def code_to_message(code)
@@ -69,11 +70,11 @@ class PostCH < Site::SimpleString
69
70
  resp = @mechanize.get("https://service.post.ch/ekp-web/api/shipment/id/#{identity}/events", nil, nil, headers)
70
71
 
71
72
  json_content = JSON.parse(resp.body)
72
- @parsed_content = []
73
+ @parsed_json = []
73
74
 
74
75
  json_content.each do |event|
75
76
  event['description'] = code_to_message(event['eventCode'])
76
- @parsed_content << event
77
+ @parsed_json << event
77
78
  end
78
79
  end
79
80
 
@@ -89,7 +90,7 @@ class PostCH < Site::SimpleString
89
90
  end
90
91
 
91
92
  def get_content()
92
- evs = @parsed_content.map { |e|
93
+ evs = @parsed_json.map { |e|
93
94
  e['timestamp'] = DateTime.strptime(e['timestamp'], "%Y-%m-%dT%H:%M:%S%Z")
94
95
  e
95
96
  }
@@ -0,0 +1,54 @@
1
+ require_relative "../webwatchr/site"
2
+
3
+ # example:
4
+ #
5
+ # update PostNL do
6
+ # track_id "XX102917683NL"
7
+ # end
8
+
9
+ class PostNL < Site::SimpleString
10
+ require "net/http"
11
+ require "json"
12
+
13
+ def track_id(track_id)
14
+ # Sets the Track ID & URL
15
+ @track_id = track_id
16
+ @url = "https://www.postnl.post/track?barcodes=#{track_id}"
17
+ self
18
+ end
19
+
20
+ def initialize
21
+ super()
22
+ @parsed_json = nil
23
+ end
24
+
25
+ def pull_things()
26
+ resp = Net::HTTP.post(URI.parse("https://postnl.post/api/v1/auth/token"), nil, nil)
27
+ token = JSON.parse(resp.body)["access_token"]
28
+
29
+ resp = Net::HTTP.post(
30
+ URI.parse("https://postnl.post/api/v1/tracking-items"), {
31
+ "items" => ["CH188699083NL"],
32
+ "language_code" => "en"
33
+ }.to_json,
34
+ {
35
+ 'Content-Type' => 'application/json',
36
+ 'Authorization' => "Bearer #{token}"
37
+ }
38
+ )
39
+ @parsed_json = JSON.parse(resp.body)
40
+ end
41
+
42
+ def extract_content()
43
+ res = []
44
+ @parsed_json['data']['items'][0]['events'].each do |event|
45
+ msg = "#{event['datetime_local']}: #{event['status_description']}"
46
+ if event['country_code']
47
+ msg << " (#{event['country_code']})"
48
+ end
49
+ res << msg
50
+ end
51
+
52
+ return ResultObject.new(res.join(""))
53
+ end
54
+ end
@@ -9,8 +9,8 @@ class Songkick < Site::Articles
9
9
  return self
10
10
  end
11
11
 
12
- def get_content()
13
- events = @parsed_content.css('ol.event-listings')[0]
12
+ def extract_articles()
13
+ events = @parsed_html.css('ol.event-listings')[0]
14
14
  events.css('li').each do |event|
15
15
  j = JSON.parse(event.css('script')[0].text)[0]
16
16
  date = j["startDate"]
@@ -20,20 +20,20 @@ module Webwatchr
20
20
  OptionParser.new { |o|
21
21
  o.banner = "WebWatchr is a script to poll websites and alert on changes.
22
22
  Exemple uses:
23
- * Updates all webpages according to their 'wait' value, and compare against internal state, and update it.
23
+ * Updates all registered Sites, and compare against internal state, and update it.
24
24
  ruby #{__FILE__}
25
- * Updates sites-available/site.rb, ignoring 'wait' value, and compare against internal state, and update it.
26
- ruby #{__FILE__} -s site.rb
25
+ * Updates one specific Site, ignoring its 'wait' value, and compares against internal state, and updates it.
26
+ ruby #{__FILE__} -s SiteClass
27
27
 
28
28
  Usage: ruby #{__FILE__} "
29
- o.on("-sSITE", "--site=SITE", "Run WebWatcher on one site only. It has to be the name of the class for that site.") do |val|
29
+ o.on("-sSITE", "--site=SITE", "Run Webwatchr on one site only. It has to be the name of the class for that site.") do |val|
30
30
  PARAMS[:site] = val
31
31
  PARAMS[:mode] = :single
32
32
  end
33
33
  o.on("-v", "--verbose", "Be verbose (output to STDOUT instead of logfile") do
34
34
  PARAMS[:verbose] = true
35
35
  end
36
- o.on("-t", "--test", "Check website and return what we've parsed") do
36
+ o.on("-t", "--test", "Check website (ignoring wait time) and show what we've parsed") do
37
37
  PARAMS[:test] = true
38
38
  end
39
39
  o.on("-h", "--help", "Prints this help") {
@@ -6,6 +6,22 @@ require "net/http"
6
6
  require "nokogiri"
7
7
  require_relative "./logger"
8
8
 
9
+ # Base class for a Site to be watched
10
+ #
11
+ # Handles pulling data from websites as well as storing the state and when to update next.
12
+ #
13
+ # == Overview
14
+ #
15
+ # - update() is called, which loads the saved state file
16
+ # - do_stuff() is called and checks whether or not we should update (aka: if the last time was long enough ago)
17
+ # - if it is time, we call pull_things(), which can be overloaded, but by default just:
18
+ # - fetches @url, and stores it in @website_html
19
+ # - parses @website_html, with Nokogiri, into @parsed_html
20
+ # - calls extract_content(), which is the method that extracts what we are interested in from the webpage.
21
+ # Its results will get compared with the previous execution's results.
22
+ # This is the one you should reimplement at the very least (unless you want to compare against the whole HTML body).
23
+ # - get_diff() is the method that will do the comparison, and its return value, if not nil, will trigger alerting
24
+ # - Each Alerter object in @alerters will be called, if needed.
9
25
  class Site
10
26
  include Loggable
11
27
  class ParseError < StandardError
@@ -17,7 +33,7 @@ class Site
17
33
  HTML_HEADER = "<!DOCTYPE html>\n<meta charset=\"utf-8\">\n".freeze
18
34
  DEFAULT_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'.freeze
19
35
 
20
- attr_accessor :url, :alerters, :rand_sleep, :every, :lastdir, :cache_dir, :state_file, :comment
36
+ attr_accessor :url, :alerters, :rand_sleep, :update_interval, :lastdir, :cache_dir, :state_file, :comment
21
37
 
22
38
  attr_writer :name
23
39
 
@@ -55,7 +71,11 @@ class Site
55
71
  @http_ver = 1
56
72
  @rand_sleep = 0
57
73
  @did_stuff = false
58
- @every = 3600
74
+ @update_interval = 3600
75
+ end
76
+
77
+ def display_optional_state
78
+ puts "We parsed the website and extracted content #{@content}"
59
79
  end
60
80
 
61
81
  def set_http_header(key, value)
@@ -85,18 +105,21 @@ class Site
85
105
  end
86
106
 
87
107
  def generate_html_content()
88
- return nil unless @content
108
+ raise StandardError, "We called generate_html_content, but there is no @content" unless @content
89
109
 
90
110
  message_html = Site::HTML_HEADER.dup
91
111
  message_html += @content
92
112
  return message_html
93
113
  end
94
114
 
95
- # Helper methods to generate Telegram content
115
+ # Helper methods to generate Telegram messages
96
116
  def generate_telegram_message_pieces()
117
+ raise StandardError, "We called generate_telegram_message_pieces, but there is no @content" unless @content
118
+
97
119
  return [@content]
98
120
  end
99
121
 
122
+ # Uses Curb to query websites with HTTP/2
100
123
  def fetch_url2(url)
101
124
  require "curb"
102
125
 
@@ -153,7 +176,7 @@ class Site
153
176
  end
154
177
  response = http.request(req)
155
178
  case response.code
156
- when "301", "302"
179
+ when "301", "302", "303"
157
180
  if max_redir == 0
158
181
  raise Site::RedirectError
159
182
  end
@@ -193,10 +216,6 @@ class Site
193
216
  return html
194
217
  end
195
218
 
196
- def parse_content(html)
197
- return parse_noko(html)
198
- end
199
-
200
219
  def parse_noko(html)
201
220
  noko = Nokogiri::HTML(html)
202
221
  meta = noko.css("meta")
@@ -224,13 +243,9 @@ class Site
224
243
  end
225
244
  end
226
245
 
246
+ # Takes the old state file, and updates it with the values passed in hash
227
247
  def update_state_file(hash)
228
248
  previous_state = load_state_file()
229
- previous_state.update({
230
- "time" => Time.now.to_i,
231
- "url" => @url,
232
- "wait" => @wait
233
- })
234
249
  state = previous_state.update(hash)
235
250
  save_state_file(state)
236
251
  end
@@ -238,20 +253,10 @@ class Site
238
253
  def alert()
239
254
  logger.debug "Alerting new stuff"
240
255
  @alerters.each do |alerter|
241
- alerter.alert(self) unless @alert_only.include?(alerter.class::IDENTIFIER)
242
- end
243
- end
244
-
245
- def content()
246
- unless @did_stuff
247
- raise StandardError, 'Trying to access @content, but we have not pulled any data yet'
256
+ if @alert_only.empty? or @alert_only.include?(alerter.class::IDENTIFIER)
257
+ alerter.alert(self)
258
+ end
248
259
  end
249
-
250
- return @content
251
- end
252
-
253
- def get_content()
254
- return @html_content
255
260
  end
256
261
 
257
262
  def alert_only(alerter_identifiers)
@@ -262,14 +267,11 @@ class Site
262
267
  else
263
268
  raise StandardError, "unknown type of provided alerter identifier #{alerter_identifiers}"
264
269
  end
270
+ self
265
271
  end
266
272
 
267
- def should_update?(prevous_time)
268
- return Time.now().to_i >= prevous_time + @wait
269
- end
270
-
271
- def get_new(_previous_content = nil)
272
- @content = get_content()
273
+ # This method compares the previous stored content, with the new one, and returns what is new.
274
+ def get_diff()
273
275
  return @content
274
276
  end
275
277
 
@@ -279,81 +281,83 @@ class Site
279
281
  md5 = Digest::MD5.hexdigest(@url)
280
282
  @cache_dir = File.join(cache_dir, "cache-#{URI.parse(@url).hostname}-#{md5}")
281
283
  @state_file = File.join(last_dir, "last-#{URI.parse(@url).hostname}-#{md5}")
282
- state = load_state_file()
283
- @wait = @every || state["wait"] || 60 * 60
284
284
  @test = test
285
285
  logger.debug "using #{@state_file} to store updates, and #{@cache_dir} for Cache"
286
286
 
287
287
  do_stuff()
288
288
  rescue Site::RedirectError
289
289
  msg = "Error parsing page #{@url}, too many redirects"
290
- msg += ". Will retry in #{@wait} + 30 minutes"
290
+ msg += ". Will retry in #{@update_interval} + 30 minutes"
291
291
  logger.error msg
292
292
  warn msg
293
- update_state_file({ "wait" => @wait + 30 * 60 })
293
+ update_state_file({ "time" => Time.now.to_i, "wait_at_least" => @update_interval + 30 * 60 })
294
294
  rescue Site::ParseError => e
295
295
  msg = "Error parsing page #{@url}"
296
296
  if e.message
297
297
  msg += " with error : #{e.message}"
298
298
  end
299
- msg += ". Will retry in #{@wait} + 30 minutes"
299
+ msg += ". Will retry in #{@update_interval} + 30 minutes"
300
300
  logger.error msg
301
301
  warn msg
302
- update_state_file({ "wait" => @wait + 30 * 60 })
302
+ update_state_file({ "time" => Time.now.to_i, "wait_at_least" => @update_interval + 30 * 60 })
303
303
  rescue Errno::ECONNREFUSED, Net::ReadTimeout, OpenSSL::SSL::SSLError, Net::OpenTimeout => e
304
304
  msg = "Network error on #{@url}"
305
305
  if e.message
306
306
  msg += " : #{e.message}"
307
307
  end
308
- msg += ". Will retry in #{@wait} + 30 minutes"
308
+ msg += ". Will retry in #{@update_interval} + 30 minutes"
309
309
  logger.error msg
310
310
  warn msg
311
- update_state_file({ "wait" => @wait + 30 * 60 })
311
+ update_state_file({ "time" => Time.now.to_i, "wait_at_least" => @update_interval + 30 * 60 })
312
+ end
313
+
314
+ def extract_content()
315
+ return @website_html
312
316
  end
313
317
 
318
+ # By default, we pull html from the @url, we parse it with Nokogiri
314
319
  def pull_things()
315
- @html_content = fetch_url(@url)
316
- @parsed_content = parse_content(@html_content)
320
+ @website_html = fetch_url(@url)
321
+ @parsed_html = parse_noko(@website_html)
322
+ @content = extract_content()
317
323
  end
318
324
 
319
325
  def do_stuff()
320
- new_stuff = false
326
+ # Prepare previous_state, with defaults, that can be overriden with what we may find in the state_file
321
327
  previous_state = {
322
328
  "time" => -9_999_999_999_999,
323
329
  "content" => nil
324
330
  }
325
- state = load_state_file()
326
- if state
327
- previous_state.update(state)
331
+ old_state = load_state_file()
332
+ delay_between_updates = old_state["wait_at_least"] || @update_interval || 60
333
+ if old_state
334
+ previous_state.update(old_state)
328
335
  end
329
- previous_content = previous_state["content"]
330
- if should_update?(previous_state["time"]) or @test
336
+
337
+ if @test or (Time.now().to_i >= previous_state['time'] + delay_between_updates)
331
338
  if @rand_sleep > 0 and not @test
332
339
  logger.info "Time to update #{@url} (sleeping #{@rand_sleep} sec)"
333
340
  sleep(@rand_sleep)
334
341
  else
335
342
  logger.info "Time to update #{@url}"
336
343
  end
344
+
337
345
  pull_things()
338
- new_stuff = get_new(previous_content)
346
+
347
+ new_stuff = get_diff()
339
348
  @did_stuff = true
340
349
  if new_stuff
341
350
  if @test
342
351
  logger.info "Would have alerted with new stuff:\n#{new_stuff}"
343
352
  else
344
353
  alert()
345
- update_state_file({
346
- "content" => new_stuff,
347
- "previous_content" => previous_content
348
- })
349
354
  end
350
355
  else
351
356
  logger.info "Nothing new for #{@url}"
352
357
  if @test
353
- logger.info "Current state is still :\n#{@content}"
358
+ display_optional_state()
354
359
  end
355
360
  end
356
- update_state_file({}) unless @test
357
361
  else
358
362
  @did_stuff = true
359
363
  logger.info "Too soon to update #{@url}"
@@ -397,15 +401,18 @@ class Site
397
401
  end
398
402
  end
399
403
 
400
- def get_new(previous_content = nil)
401
- # Is a ResultObject
402
- if @content
403
- raise StandardError, "The result of get_content() should be a ResultObject if the Site class is SimpleString" unless @content.class < ResultObject
404
- else
405
- @content = get_content()
406
- end
404
+ def get_diff()
405
+ @content ||= extract_content()
406
+ previous_content = load_state_file()["content"]
407
407
  return nil if @content == previous_content
408
408
 
409
+ update_state_file(
410
+ {
411
+ "time" => Time.now.to_i,
412
+ "wait_at_least" => @update_interval,
413
+ "content" => @content
414
+ }
415
+ )
409
416
  return @content
410
417
  end
411
418
 
@@ -426,87 +433,118 @@ class Site
426
433
  end
427
434
  end
428
435
 
429
- class DiffString < SimpleString
430
- begin
431
- require "diffy"
432
-
433
- def generate_html_content()
434
- diff_html = Site::HTML_HEADER.dup
435
- diff_html += "<head><style>"
436
- diff_html += Diffy::CSS
437
- diff_html += "</style><body>"
438
- diff_html += @diffed.to_s(:html)
439
- diff_html += "</body></html>"
440
- return diff_html
441
- end
442
-
443
- def get_differ(previous, new)
444
- return Diffy::Diff.new(previous, new)
445
- end
446
- rescue LoadError
447
- require "test/unit/diff"
448
- def generate_html_content()
449
- diff_html = Site::HTML_HEADER.dup
450
- diff_html += @diffed.to_s
451
- diff_html += "</body></html>"
452
- return diff_html
453
- end
454
-
455
- def get_differ(previous, new)
456
- return new unless previous
457
-
458
- return Test::Unit::Diff.unified(previous, new)
459
- end
436
+ ## For use when you want to parse a site, and are only interested in having
437
+ # a nice looking "Diff" between the new and the previous state
438
+ # class DiffString < SimpleString
439
+ # begin
440
+ # require "diffy"
441
+ #
442
+ # def generate_html_content()
443
+ # diff_html = Site::HTML_HEADER.dup
444
+ # diff_html += "<head><style>"
445
+ # diff_html += Diffy::CSS
446
+ # diff_html += "</style><body>"
447
+ # diff_html += @diffed.to_s(:html)
448
+ # diff_html += "</body></html>"
449
+ # return diff_html
450
+ # end
451
+ #
452
+ # def get_differ(previous, new)
453
+ # return Diffy::Diff.new(previous, new)
454
+ # end
455
+ # rescue LoadError
456
+ # require "test/unit/diff"
457
+ # def generate_html_content()
458
+ # diff_html = Site::HTML_HEADER.dup
459
+ # diff_html += @diffed.to_s
460
+ # diff_html += "</body></html>"
461
+ # return diff_html
462
+ # end
463
+ #
464
+ # def get_differ(previous, new)
465
+ # return new unless previous
466
+ #
467
+ # return Test::Unit::Diff.unified(previous, new)
468
+ # end
469
+ # end
470
+ #
471
+ # def get_diff()
472
+ # new_stuff = nil
473
+ # @content = extract_content()
474
+ # unless @content
475
+ # return nil
476
+ # end
477
+ #
478
+ # if @content != previous_content
479
+ # @diffed = get_differ(previous_content, @content)
480
+ # new_stuff = @diffed.to_s
481
+ # end
482
+ # return new_stuff
483
+ # end
484
+ # end
485
+
486
+ ## For use when you want to parse a site that has Articles
487
+ # And you want to know when new, previously unseen Articles appear.
488
+ # For example, a shop.
489
+ #
490
+ # You need to make sure to call add_article() with instances of Article.
491
+ class Articles < Site
492
+ class Article < Hash
460
493
  end
461
494
 
462
- def get_new(previous_content = nil)
463
- new_stuff = nil
464
- @content = get_content()
465
- unless @content
466
- return nil
467
- end
495
+ def initialize
496
+ super
497
+ @articles = []
498
+ @found_articles = 0
499
+ end
468
500
 
469
- if @content != previous_content
470
- @diffed = get_differ(previous_content, @content)
471
- new_stuff = @diffed.to_s
472
- end
473
- return new_stuff
501
+ def content
502
+ log.error("Do not use site.content on an instance of Site::Articles in #{caller}")
503
+ return @articles
474
504
  end
475
- end
476
505
 
477
- class Articles < Site
478
- def initialize
479
- super
480
- @content = []
506
+ def display_optional_state
507
+ puts "We parsed the website and extracted #{@found_articles} articles"
481
508
  end
482
509
 
483
- def validate(item)
484
- raise StandardError, "Needs at least \"id\" key" unless item["id"]
510
+ def validate(article)
511
+ id = article['id']
512
+ raise StandardError, "Article needs an \"id\", which is used as identifier" unless id
485
513
 
486
- id = item["id"]
487
514
  raise StandardError, "\"id\" key needs to be a String and not #{id.class}" unless id.is_a?(String)
488
515
  end
489
516
 
490
- def add_article(item)
491
- logger.debug "Found article #{item['id']}"
492
- validate(item)
493
- item["_timestamp"] = Time.now().to_i
494
- @content << item unless @content.map { |x| x['id'] }.include?(item['id'])
517
+ def add_article(article)
518
+ logger.debug "Found article #{article['id']}"
519
+ @found_articles += 1
520
+ validate(article)
521
+ article['_timestamp'] = Time.now().to_i
522
+ @articles << article unless @articles.map { |art| art['id'] }.include?(article['id'])
523
+ end
524
+
525
+ def extract_articles()
526
+ raise StandardError, "Please implement extract_articles(). Use @parsed_html and call add_article()."
495
527
  end
496
528
 
497
- def get_new(previous_content)
498
- new_stuff = []
499
- get_content()
500
- unless @content
529
+ def get_diff()
530
+ extract_articles()
531
+ unless @articles
501
532
  return nil
502
533
  end
503
534
 
504
- if previous_content
505
- previous_ids = previous_content.map { |h| h["id"] }
506
- new_stuff = @content.delete_if { |item| previous_ids.include?(item["id"]) }
507
- else
508
- new_stuff = @content
535
+ new_stuff = @articles
536
+ previous_articles = load_state_file()["articles"]
537
+ if previous_articles
538
+ previous_ids = previous_articles.map { |art| art['id'] }
539
+ new_stuff = @articles.delete_if { |article| previous_ids.include?(article['id']) }
509
540
  end
541
+ update_state_file(
542
+ {
543
+ "time" => Time.now.to_i,
544
+ "wait_at_least" => @update_interval,
545
+ "articles" => (previous_articles || []).concat(@articles)
546
+ }
547
+ )
510
548
  if (not new_stuff) or new_stuff.empty?
511
549
  return nil
512
550
  end
@@ -514,37 +552,28 @@ class Site
514
552
  return new_stuff
515
553
  end
516
554
 
555
+ # Here we want to store every article we ever found
517
556
  def update_state_file(hash)
518
- hash_content = hash["content"]
519
- hash.delete("content")
520
557
  previous_state = load_state_file()
521
- previous_state.update({
522
- "time" => Time.now.to_i,
523
- "url" => @url,
524
- "wait" => @wait
525
- })
526
558
  state = previous_state.update(hash)
527
- if hash_content
528
- (previous_state["content"] ||= []).concat(hash_content)
529
- end
530
559
  save_state_file(state)
531
560
  end
532
561
 
533
562
  def generate_html_content()
534
563
  message_html = Site::HTML_HEADER.dup
535
564
  message_html << "<ul style='list-style-type: none;'>\n"
536
- @content.each do |item|
537
- msg = "<li id='#{item['id']}'>"
538
- if item["url"]
539
- msg += "<a href='#{item['url']}'>"
565
+ @articles.each do |article|
566
+ msg = "<li id='#{article['id']}'>"
567
+ if article['url']
568
+ msg += "<a href='#{article['url']}'>"
540
569
  end
541
- if item["img_src"]
542
- msg += "<img style='width:100px' src='#{item['img_src']}'/>"
570
+ if article["img_src"]
571
+ msg += "<img style='width:100px' src='#{article['img_src']}'/>"
543
572
  end
544
- if item["title"]
545
- msg += item['title'].to_s
573
+ if article["title"]
574
+ msg += article['title'].to_s
546
575
  end
547
- if item["url"]
576
+ if article["url"]
548
577
  msg += "</a>"
549
578
  end
550
579
  msg += "</li>\n"
@@ -556,16 +585,16 @@ class Site
556
585
 
557
586
  def generate_telegram_message_pieces()
558
587
  msg_pieces = []
559
- @content.each do |item|
560
- line = item["title"]
561
- if item["url"]
588
+ @articles.each do |article|
589
+ line = article["title"]
590
+ if article["url"]
562
591
  if line
563
- line += ": #{item['url']}"
592
+ line += ": #{article['url']}"
564
593
  else
565
- line = item["url"]
594
+ line = article["url"]
566
595
  end
567
596
 
568
- line += ": #{item['url']}"
597
+ line += ": #{article['url']}"
569
598
  end
570
599
  msg_pieces << line
571
600
  end
data/tests/helpers.rb CHANGED
@@ -27,6 +27,12 @@ class TestAlerter < Webwatchr::Alerting::Base
27
27
  end
28
28
 
29
29
  def alert(site)
30
- @result = site.content
30
+ if site.is_a?(Site::Articles)
31
+ @result = site.articles
32
+ elsif site.is_a?(Site::SimpleString)
33
+ @result = site.content
34
+ else
35
+ raise StandardError, "Unknown Site class being tests: #{site.class}"
36
+ end
31
37
  end
32
38
  end
data/tests/infra_test.rb CHANGED
@@ -45,6 +45,10 @@ class BaseWebrickTest < Test::Unit::TestCase
45
45
  restart_webrick()
46
46
  end
47
47
 
48
+ def cleanup
49
+ FileUtils.remove_entry_secure(@workdir) if File.directory?(@workdir)
50
+ end
51
+
48
52
  def teardown()
49
53
  @webrick.stop
50
54
  @serv_thread.join
@@ -53,7 +57,6 @@ class BaseWebrickTest < Test::Unit::TestCase
53
57
  f.puts ""
54
58
  end
55
59
  end
56
- FileUtils.remove_entry_secure(@workdir)
57
60
  end
58
61
  end
59
62
 
@@ -61,11 +64,11 @@ class TestSimpleStringSite < BaseWebrickTest
61
64
  class TestStringSite < Site::SimpleString
62
65
  def initialize
63
66
  super()
64
- @wait = 3600
67
+ @update_interval = 200
65
68
  end
66
69
 
67
- def get_content()
68
- return ResultObject.new(@parsed_content.css("div.content").text)
70
+ def extract_content()
71
+ return ResultObject.new(@parsed_html.css("div.content").text)
69
72
  end
70
73
  end
71
74
 
@@ -77,31 +80,27 @@ class TestSimpleStringSite < BaseWebrickTest
77
80
  f.write whole_html
78
81
  end
79
82
  url = "http://localhost:#{TEST_CONFIG[:wwwport]}/#{TEST_CONFIG[:content_is_string_file]}"
80
- wait = 10 * 60
81
83
 
82
84
  c = TestStringSite.new
83
85
  c.url = url
84
86
  a = TestAlerter.new()
85
87
  c.alerters = [a]
86
88
  assert { c.load_state_file() == {} }
87
- assert { c.should_update?(-9_999_999_999_999) }
88
- assert { c.should_update?((Time.now() - wait + 30).to_i) == false }
89
89
  html = c.fetch_url(url)
90
90
  assert { whole_html == html }
91
91
  assert { c.parse_noko(html).css("title").text == "test" }
92
92
  cache_dir = File.join(@workdir, "cache")
93
93
  last_dir = File.join(@workdir, ".lasts")
94
+ c.state_file = File.join(last_dir, "last-localhost-2182cd5c8685baed48f692ed72d7a89f")
94
95
  FileUtils.mkdir_p(cache_dir)
95
96
  FileUtils.mkdir_p(last_dir)
96
97
  c.update(cache_dir: cache_dir, last_dir: last_dir)
97
- assert { c.state_file.end_with?("last-localhost-2182cd5c8685baed48f692ed72d7a89f") }
98
98
  expected_error = "DEBUG -- TestSimpleStringSite::TestStringSite: Alerting new stuff"
99
99
  last_error = @logger_test_io.string.split("\n")[-1]
100
100
  assert { last_error.end_with?(expected_error) }
101
101
  first_pass_content = Site::HTML_HEADER + content_html
102
- assert { c.content.to_html == content_html }
103
102
  assert { c.generate_html_content == first_pass_content }
104
- assert { a.result == c.content }
103
+ assert { a.result.message == c.content.message }
105
104
 
106
105
  File.open(File.join(TEST_CONFIG[:wwwroot], TEST_CONFIG[:content_is_string_file]), "w+") do |f|
107
106
  f.write whole_html.gsub("</div>", " new ! </div>")
@@ -121,7 +120,7 @@ class TestSimpleStringSite < BaseWebrickTest
121
120
  assert { c.generate_html_content.nil? }
122
121
  assert { c.name == url }
123
122
 
124
- c.every = 0
123
+ c.update_state_file({ "time" => Time.now.to_i - 300 })
125
124
  c.update(cache_dir: cache_dir, last_dir: last_dir)
126
125
  expected_error = "DEBUG -- TestSimpleStringSite::TestStringSite: Alerting new stuff"
127
126
  last_error = @logger_test_io.string.split("\n")[-1]
@@ -131,9 +130,10 @@ class TestSimpleStringSite < BaseWebrickTest
131
130
  assert { c.name == url }
132
131
  result_last = JSON.parse(File.read(c.state_file), create_additions: true)
133
132
  result_last.delete("time")
134
- assert { result_last["url"] == url }
135
133
  assert { result_last["content"].message == "#{content_html} new ! " }
136
- assert { result_last["wait"] == 0 }
134
+ assert { result_last["wait_at_least"] == 200 }
135
+ ensure
136
+ cleanup
137
137
  end
138
138
  end
139
139
 
@@ -141,14 +141,14 @@ class TestArraySites < BaseWebrickTest
141
141
  class TestArraySite < Site::Articles
142
142
  def initialize
143
143
  super()
144
- @wait = 3600
144
+ @update_interval = 200
145
145
  end
146
146
 
147
- def get_content()
147
+ def extract_articles()
148
148
  res = []
149
- @parsed_content.css("div").each do |x|
149
+ @parsed_html.css("div").each do |x|
150
150
  a, b = x.text.split("-").map(&:strip)
151
- add_article({ "id" => a, "url" => a, "title" => b })
151
+ add_article(Article["id" => a, "url" => a, "title" => b])
152
152
  end
153
153
  return res
154
154
  end
@@ -160,15 +160,12 @@ class TestArraySites < BaseWebrickTest
160
160
  f.write whole_html
161
161
  end
162
162
  url = "http://localhost:#{TEST_CONFIG[:wwwport]}/#{TEST_CONFIG[:content_is_array_file]}"
163
- wait = 10 * 60
164
163
 
165
164
  c = TestArraySite.new
166
165
  c.url = url
167
166
  a = TestAlerter.new()
168
167
  c.alerters = [a]
169
168
  assert { c.load_state_file() == {} }
170
- assert { c.should_update?(-9_999_999_999_999) }
171
- assert { !c.should_update?((Time.now() - wait + 30).to_i) }
172
169
  html = c.fetch_url(url)
173
170
  assert { html == whole_html }
174
171
  assert { c.parse_noko(html).css("title").text == "test" }
@@ -189,17 +186,15 @@ class TestArraySites < BaseWebrickTest
189
186
  "<li id='fi'><a href='fi'>fu</a></li>",
190
187
  "</ul>"
191
188
  ].join("\n")
192
- c.content.each { |x| x.delete('_timestamp') }
189
+ c.articles.each { |x| x.delete('_timestamp') }
193
190
  assert {
194
- c.content == [
191
+ c.articles == [
195
192
  { "id" => "lol", "url" => "lol", "title" => "lilo" },
196
193
  { "id" => "fi", "url" => "fi", "title" => "fu" }
197
194
  ]
198
195
  }
199
196
  assert { c.generate_html_content == expected_html }
200
197
 
201
- result = ""
202
-
203
198
  File.open(File.join(TEST_CONFIG[:wwwroot], TEST_CONFIG[:content_is_array_file]), "a+") do |f|
204
199
  f.write "<div>new! - new </div>"
205
200
  end
@@ -207,46 +202,39 @@ class TestArraySites < BaseWebrickTest
207
202
  c.url = url
208
203
  a = TestAlerter.new()
209
204
  c.alerters = [a]
210
- # Second run don't d anything because we shouldn't rerun
205
+ # Second run don't do anything because we shouldn't rerun
211
206
  c.update(cache_dir: cache_dir, last_dir: last_dir)
212
207
  expected_error = "INFO -- TestArraySites::TestArraySite: Too soon to update #{url}"
213
208
  last_error = @logger_test_io.string.split("\n")[-1]
214
209
  assert { last_error.end_with?(expected_error) }
215
- assert { result == "" }
216
210
 
217
- result = ""
218
-
219
- c.content.each { |x| x.delete('_timestamp') }
211
+ c.update_state_file({ "time" => Time.now.to_i - 300 })
220
212
 
221
- c.every = 0
222
213
  # This time we set new things, and wait is 0 so we are good to go
223
214
  c.update(cache_dir: cache_dir, last_dir: last_dir)
224
215
  expected_error = "DEBUG -- TestArraySites::TestArraySite: Alerting new stuff"
225
216
  last_error = @logger_test_io.string.split("\n")[-1]
226
217
  assert { last_error.end_with?(expected_error) }
218
+
227
219
  expected_html = Site::HTML_HEADER.dup + [
228
220
  "<ul style='list-style-type: none;'>",
229
221
  "<li id='new!'><a href='new!'>new</a></li>",
230
222
  "</ul>"
231
223
  ].join("\n")
232
224
 
233
- c.content.each { |x| x.delete('_timestamp') }
234
- assert { c.content == [{ "id" => "new!", "url" => "new!", "title" => "new" }] }
225
+ c.articles.each { |x| x.delete('_timestamp') }
226
+ assert { c.articles == [{ "id" => "new!", "url" => "new!", "title" => "new" }] }
235
227
  assert { c.generate_html_content == expected_html }
236
- expected_last = { "url" => "http://localhost:#{TEST_CONFIG[:wwwport]}/#{TEST_CONFIG[:content_is_array_file]}",
237
- "previous_content" => [{ "id" => "lol", "url" => "lol", "title" => "lilo" },
238
- { "id" => "fi", "url" => "fi", "title" => "fu" }],
239
- "wait" => 0,
240
- "content" => [{ "id" => "lol", "title" => "lilo", "url" => "lol" },
241
- { "id" => "fi", "title" => "fu", "url" => "fi" },
242
- { "id" => "new!", "title" => "new", "url" => "new!" }] }
228
+ expected_last = {
229
+ "wait_at_least" => 200,
230
+ "articles" => [{ "id" => "lol", "title" => "lilo", "url" => "lol" },
231
+ { "id" => "fi", "title" => "fu", "url" => "fi" },
232
+ { "id" => "new!", "title" => "new", "url" => "new!" }]
233
+ }
243
234
  result_last = JSON.parse(File.read(c.state_file))
244
235
  result_last.delete("time")
245
- result_last["content"].each do |item|
246
- item.delete("_timestamp")
247
- end
248
- result_last["previous_content"].each do |item|
249
- item.delete("_timestamp")
236
+ result_last["articles"].each do |article|
237
+ article.delete("_timestamp")
250
238
  end
251
239
  assert { expected_last == result_last }
252
240
 
@@ -256,8 +244,9 @@ class TestArraySites < BaseWebrickTest
256
244
  c.url = url
257
245
  a = TestAlerter.new()
258
246
  c.alerters = [a]
259
- c.every = 0
260
- # Now, we don't call the alert Proc because we have no new things
247
+ # Now, we don't call the alerters because we have no new things
248
+ c.state_file = File.join(last_dir, "last-localhost-35e711989b197f20f3d4936e91a2c079")
249
+ c.update_state_file({ "time" => Time.now.to_i - 300 })
261
250
  c.update(cache_dir: cache_dir, last_dir: last_dir)
262
251
  expected_error = "INFO -- TestArraySites::TestArraySite: Nothing new for #{url}"
263
252
  last_error = @logger_test_io.string.split("\n")[-1]
@@ -267,5 +256,7 @@ class TestArraySites < BaseWebrickTest
267
256
  "</ul>"
268
257
  ].join("\n")
269
258
  assert { result == "" }
259
+ ensure
260
+ cleanup
270
261
  end
271
262
  end
data/webwatchr.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "webwatchr"
3
- s.version = "0.0.2"
3
+ s.version = "0.0.3"
4
4
  s.summary = "Scrapes stuff and tells you of updates."
5
5
  s.description = "Scrapes stuff and tells you of updates. Exciting!"
6
6
  s.authors = ["Renzo"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webwatchr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Renzo
@@ -22,6 +22,7 @@ files:
22
22
  - lib/sites/bandcamp.rb
23
23
  - lib/sites/bsky.rb
24
24
  - lib/sites/postch.rb
25
+ - lib/sites/postnl.rb
25
26
  - lib/sites/songkick.rb
26
27
  - lib/webwatchr.rb
27
28
  - lib/webwatchr/alerting.rb