webwatchr 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1675ffe16b57de72800f93dfa9a81a83475e5f815e3a40a3df5175f7d5f0ce69
4
- data.tar.gz: 1936758eb8f893715b7f91337358a0f34b3ea50ff5dca4f8d39eeb2b7119d46f
3
+ metadata.gz: a2ceb7b164a65a05718c637bf20288b99bd6dc71f8e048508e1e5e45db03b351
4
+ data.tar.gz: 7f7356e8e9806d1cca36cae95633a0de35c5ac0881061f3dd44de415144ee3c2
5
5
  SHA512:
6
- metadata.gz: 48680b66395e44a11241bf873018aa447f5e2ecabca4862dfc870209dfcb04bc5c35023af7f2dde6f0129344cfc6186b12abae0b3ae23089b33ef74ee00e19a6
7
- data.tar.gz: 6afdf42375ca35935340649188dc2bccbeefade1ddec65ea2c56d0f601db7d24ddf48e49251fa80a0b7739fee35ad1ee914c62883ef4b40805c68930b3368e42
6
+ metadata.gz: 5a905d1ec16cedc8e60aba00f1c1a898fd9282a1d5a159a44467feb11d22fa6f75c92df016d3cb01a40679ea8a91dbf5a459e9ed6592f4417ddb2ad0b6c3c113
7
+ data.tar.gz: 6b47cc97bc17c24b1589b1fc0ecb1ea51b6eef14e2c01a77cecc2583df6d21f6ed36cf98d98fc237cccd7909a27e56b3b5a58dc067b4c2667afe5fd912cdec8a
data/README.md CHANGED
@@ -6,7 +6,8 @@ Silly script to periodically check webpage changes.
6
6
  2. pulls data for every Website to check, if the last time we did that is long ago
7
7
  4. if content is different from the last time, alerts you with the new content (email, telegram)
8
8
 
9
- # Installation
9
+
10
+ ## Installation
10
11
 
11
12
  ```shell
12
13
 
@@ -29,7 +30,7 @@ class SomeSimpleSite < Site::SimpleString
29
30
  # Implement this function, to return what you want to compare every run
30
31
  def get_content
31
32
  res = ""
32
- @parsed_content.css("div.shop-main a").map do |a|
33
+ @parsed_html.css("div.shop-main a").map do |a|
33
34
  url = "https://somesimplesite.com/shop/#{a['href']}"
34
35
  if a.css('img')[0]['src'] == "soldout.png"
35
36
  next
@@ -81,7 +82,7 @@ Run the cron often:
81
82
  */5 * * * * cd /home/poil/my_fav_scripts/; ruby dsl.rb
82
83
  ```
83
84
 
84
- # Supported websites
85
+ ## Supported websites
85
86
 
86
87
  List of sites that are somewhat maintained are [listed here](https://github.com/conchyliculture/webwatchr).
87
88
 
@@ -92,6 +93,18 @@ Some examples:
92
93
  * Package tracking (DHL, Colissimo, i-parcel, Royalmail, PostNL, UPS, USPS, etc.)
93
94
 
94
95
 
96
+ ## Command line options
97
+
98
+ From `--help`:
99
+
100
+ ```
101
+ Usage: ruby /home/renzokuken/scripts/webwatchr/lib/webwatchr/main.rb
102
+ -s, --site=SITE Run Webwatchr on one site only. It has to be the name of the class for that site.
103
+ -v, --verbose Be verbose (output to STDOUT instead of logfile
104
+ -t, --test Check website (ignoring wait time) and show what we've parsed
105
+ -h, --help Prints this help
106
+ ```
107
+
95
108
  ## Force a site check, ignoring the 'wait' parameter
96
109
 
97
110
  This can be useful to run a site update at a specific time/day with a crontab, instead of every specified amount of time. You can force update a website using the -s flag:
@@ -99,24 +112,44 @@ This can be useful to run a site update at a specific time/day with a crontab, i
99
112
  ruby webwatchr.rb -t -s SiteClass
100
113
  ```
101
114
 
102
- # FAQ
103
- ## Tests?
115
+ ## FAQ
116
+ ### Tests?
104
117
 
105
- There are like like, two!
118
+ There are, like, two!
106
119
 
107
120
  Run `rake`
108
121
 
109
- ## Logs ?
122
+ ### Logs ?
110
123
 
111
124
  Call `logger`, as you would a classic `Logger` object in your `mysite.rb`.
112
125
 
113
- ## Alerting
126
+ ### Alerting
114
127
 
115
128
  Email is the main method of alerting, but you can also set webwatchr to talk to you on Telegram through a bot.
116
129
 
117
- ### Email
130
+ #### Email
131
+
132
+ In your Main block, add
133
+
134
+ ```ruby
135
+ add_default_alert :email do
136
+ set :smtp_port, 25
137
+ set :smtp_server, "localhost"
138
+ set :from_addr, "webwatchr@domain.eu"
139
+ set :dest_addr, "admin@domain.eu"
140
+ end
141
+ ```
142
+
143
+ #### Telegram
118
144
 
119
145
  First make a bot and grab a token following the [Telegram procedure](https://core.telegram.org/bots#6-botfather).
120
146
 
121
147
  You also need to know the `chat_id` for its discussion with you. The code in [there](https://github.com/atipugin/telegram-bot-ruby/blob/master/examples/bot.rb) can help you.
122
148
 
149
+ then in your Main block, add
150
+
151
+ ```ruby
152
+ add_default_alert :telegram do
153
+ set :token, "12345:LONGTOKEN09876543"
154
+ set :chat_id, 1234567890
155
+ end
@@ -10,12 +10,12 @@ class BandcampMerch < Site::Articles
10
10
  self
11
11
  end
12
12
 
13
- def get_content()
14
- if @html_content =~ /You are being redirected, please follow <a href="([^"]+)"/
13
+ def extract_articles()
14
+ if @website_html =~ /You are being redirected, please follow <a href="([^"]+)"/
15
15
  new_url = ::Regexp.last_match(1)
16
- @html_content = Net::HTTP.get(URI.parse(new_url))
17
- @parsed_content = Nokogiri::HTML.parse(@html_content)
18
- item = @parsed_content.css('div#merch-item')
16
+ @website_html = Net::HTTP.get(URI.parse(new_url))
17
+ @parsed_html = Nokogiri::HTML.parse(@website_html)
18
+ item = @parsed_html.css('div#merch-item')
19
19
  if item.css(".notable").text == "Sold Out"
20
20
  logger.debug "That item is sold out =("
21
21
  return
@@ -30,11 +30,7 @@ class BandcampMerch < Site::Articles
30
30
  "title" => title
31
31
  })
32
32
  else
33
- f = File.new("/tmp/b", "w")
34
- f.write(@html_content)
35
- f.close
36
-
37
- @parsed_content.css('ol.merch-grid li').each do |xx|
33
+ @parsed_html.css('ol.merch-grid li').each do |xx|
38
34
  unless xx.css('p.sold-out').empty?
39
35
  logger.debug "That item is sold out =("
40
36
  next
data/lib/sites/bsky.rb CHANGED
@@ -107,14 +107,11 @@ class BskyAccount < BskyBase
107
107
  did = _profile_to_did(@account)
108
108
  path = "/xrpc/app.bsky.feed.getAuthorFeed?actor=#{did}&filter=posts_and_author_threads&limit=30"
109
109
  resp = _api_get(path)
110
- @parsed_content = JSON.parse(resp.body)
111
- f = File.open("/tmp/qsd", 'w')
112
- f.write(resp.body)
113
- f.close
110
+ @parsed_json = JSON.parse(resp.body)
114
111
  end
115
112
 
116
- def get_content
117
- @parsed_content['feed'].each do |p|
113
+ def extract_articles
114
+ @parsed_json['feed'].each do |p|
118
115
  post = p['post']
119
116
  text = post['record']['text']
120
117
  next if @regex and (text !~ @regex)
@@ -165,11 +162,11 @@ class BskySearch < BskyBase
165
162
 
166
163
  params = { "q" => "#danemark", "limit" => 30, "sort" => "top" }
167
164
  resp = _api_get("/xrpc/app.bsky.feed.searchPosts", params: params.to_a, headers: headers)
168
- @parsed_content = JSON.parse(resp.body)
165
+ @parsed_json = JSON.parse(resp.body)
169
166
  end
170
167
 
171
- def get_content
172
- @parsed_content['posts'].each do |post|
168
+ def extract_articles
169
+ @parsed_json['posts'].each do |post|
173
170
  add_article(_article_from_post(post))
174
171
  end
175
172
  end
data/lib/sites/postch.rb CHANGED
@@ -16,6 +16,7 @@ class PostCH < Site::SimpleString
16
16
  @mechanize = Mechanize.new()
17
17
  @mechanize.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:132.0) Gecko/20100101 Firefox/132.0'
18
18
  @text_messages = {}
19
+ @parsed_json = nil
19
20
  end
20
21
 
21
22
  def code_to_message(code)
@@ -69,11 +70,11 @@ class PostCH < Site::SimpleString
69
70
  resp = @mechanize.get("https://service.post.ch/ekp-web/api/shipment/id/#{identity}/events", nil, nil, headers)
70
71
 
71
72
  json_content = JSON.parse(resp.body)
72
- @parsed_content = []
73
+ @parsed_json = []
73
74
 
74
75
  json_content.each do |event|
75
76
  event['description'] = code_to_message(event['eventCode'])
76
- @parsed_content << event
77
+ @parsed_json << event
77
78
  end
78
79
  end
79
80
 
@@ -89,7 +90,7 @@ class PostCH < Site::SimpleString
89
90
  end
90
91
 
91
92
  def get_content()
92
- evs = @parsed_content.map { |e|
93
+ evs = @parsed_json.map { |e|
93
94
  e['timestamp'] = DateTime.strptime(e['timestamp'], "%Y-%m-%dT%H:%M:%S%Z")
94
95
  e
95
96
  }
@@ -0,0 +1,54 @@
1
+ require_relative "../webwatchr/site"
2
+
3
+ # example:
4
+ #
5
+ # update PostNL do
6
+ # track_id "XX102917683NL"
7
+ # end
8
+
9
+ class PostNL < Site::SimpleString
10
+ require "net/http"
11
+ require "json"
12
+
13
+ def track_id(track_id)
14
+ # Sets the Track ID & URL
15
+ @track_id = track_id
16
+ @url = "https://www.postnl.post/track?barcodes=#{track_id}"
17
+ self
18
+ end
19
+
20
+ def initialize
21
+ super()
22
+ @parsed_json = nil
23
+ end
24
+
25
+ def pull_things()
26
+ resp = Net::HTTP.post(URI.parse("https://postnl.post/api/v1/auth/token"), nil, nil)
27
+ token = JSON.parse(resp.body)["access_token"]
28
+
29
+ resp = Net::HTTP.post(
30
+ URI.parse("https://postnl.post/api/v1/tracking-items"), {
31
+ "items" => ["CH188699083NL"],
32
+ "language_code" => "en"
33
+ }.to_json,
34
+ {
35
+ 'Content-Type' => 'application/json',
36
+ 'Authorization' => "Bearer #{token}"
37
+ }
38
+ )
39
+ @parsed_json = JSON.parse(resp.body)
40
+ end
41
+
42
+ def extract_content()
43
+ res = []
44
+ @parsed_json['data']['items'][0]['events'].each do |event|
45
+ msg = "#{event['datetime_local']}: #{event['status_description']}"
46
+ if event['country_code']
47
+ msg << " (#{event['country_code']})"
48
+ end
49
+ res << msg
50
+ end
51
+
52
+ return ResultObject.new(res.join(""))
53
+ end
54
+ end
@@ -9,8 +9,8 @@ class Songkick < Site::Articles
9
9
  return self
10
10
  end
11
11
 
12
- def get_content()
13
- events = @parsed_content.css('ol.event-listings')[0]
12
+ def extract_articles()
13
+ events = @parsed_html.css('ol.event-listings')[0]
14
14
  events.css('li').each do |event|
15
15
  j = JSON.parse(event.css('script')[0].text)[0]
16
16
  date = j["startDate"]
@@ -20,20 +20,20 @@ module Webwatchr
20
20
  OptionParser.new { |o|
21
21
  o.banner = "WebWatchr is a script to poll websites and alert on changes.
22
22
  Exemple uses:
23
- * Updates all webpages according to their 'wait' value, and compare against internal state, and update it.
23
+ * Updates all registered Sites, and compare against internal state, and update it.
24
24
  ruby #{__FILE__}
25
- * Updates sites-available/site.rb, ignoring 'wait' value, and compare against internal state, and update it.
26
- ruby #{__FILE__} -s site.rb
25
+ * Updates one specific Site, ignoring its 'wait' value, and compares against internal state, and updates it.
26
+ ruby #{__FILE__} -s SiteClass
27
27
 
28
28
  Usage: ruby #{__FILE__} "
29
- o.on("-sSITE", "--site=SITE", "Run WebWatcher on one site only. It has to be the name of the class for that site.") do |val|
29
+ o.on("-sSITE", "--site=SITE", "Run Webwatchr on one site only. It has to be the name of the class for that site.") do |val|
30
30
  PARAMS[:site] = val
31
31
  PARAMS[:mode] = :single
32
32
  end
33
33
  o.on("-v", "--verbose", "Be verbose (output to STDOUT instead of logfile") do
34
34
  PARAMS[:verbose] = true
35
35
  end
36
- o.on("-t", "--test", "Check website and return what we've parsed") do
36
+ o.on("-t", "--test", "Check website (ignoring wait time) and show what we've parsed") do
37
37
  PARAMS[:test] = true
38
38
  end
39
39
  o.on("-h", "--help", "Prints this help") {
@@ -6,6 +6,22 @@ require "net/http"
6
6
  require "nokogiri"
7
7
  require_relative "./logger"
8
8
 
9
+ # Base class for a Site to be watched
10
+ #
11
+ # Handles pulling data from websites as well as storing the state and when to update next.
12
+ #
13
+ # == Overview
14
+ #
15
+ # - update() is called, which loads the saved state file
16
+ # - do_stuff() is called and checks whether or not we should update (aka: if the last time was long enough ago)
17
+ # - if it is time, we call pull_things(), which can be overloaded, but by default just:
18
+ # - fetches @url, and stores it in @website_html
19
+ # - parses @website_html, with Nokogiri, into @parsed_html
20
+ # - calls extract_content(), which is the method that extracts what we are interested in from the webpage.
21
+ # Its results will get compared with the previous execution's results.
22
+ # This is the one you should reimplement at the very least (unless you want to compare against the whole HTML body).
23
+ # - get_diff() is the method that will do the comparison, and its return value, if not nil, will trigger alerting
24
+ # - Each Alerter object in @alerters will be called, if needed.
9
25
  class Site
10
26
  include Loggable
11
27
  class ParseError < StandardError
@@ -17,7 +33,7 @@ class Site
17
33
  HTML_HEADER = "<!DOCTYPE html>\n<meta charset=\"utf-8\">\n".freeze
18
34
  DEFAULT_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'.freeze
19
35
 
20
- attr_accessor :url, :alerters, :rand_sleep, :every, :lastdir, :cache_dir, :state_file, :comment
36
+ attr_accessor :url, :alerters, :rand_sleep, :update_interval, :lastdir, :cache_dir, :state_file, :comment
21
37
 
22
38
  attr_writer :name
23
39
 
@@ -55,7 +71,11 @@ class Site
55
71
  @http_ver = 1
56
72
  @rand_sleep = 0
57
73
  @did_stuff = false
58
- @every = 3600
74
+ @update_interval = 3600
75
+ end
76
+
77
+ def display_optional_state
78
+ puts "We parsed the website and extracted content #{@content}"
59
79
  end
60
80
 
61
81
  def set_http_header(key, value)
@@ -85,18 +105,21 @@ class Site
85
105
  end
86
106
 
87
107
  def generate_html_content()
88
- return nil unless @content
108
+ raise StandardError, "We called generate_html_content, but there is no @content" unless @content
89
109
 
90
110
  message_html = Site::HTML_HEADER.dup
91
111
  message_html += @content
92
112
  return message_html
93
113
  end
94
114
 
95
- # Helper methods to generate Telegram content
115
+ # Helper methods to generate Telegram messages
96
116
  def generate_telegram_message_pieces()
117
+ raise StandardError, "We called generate_telegram_message_pieces, but there is no @content" unless @content
118
+
97
119
  return [@content]
98
120
  end
99
121
 
122
+ # Uses Curb to query websites with HTTP/2
100
123
  def fetch_url2(url)
101
124
  require "curb"
102
125
 
@@ -153,7 +176,7 @@ class Site
153
176
  end
154
177
  response = http.request(req)
155
178
  case response.code
156
- when "301", "302"
179
+ when "301", "302", "303"
157
180
  if max_redir == 0
158
181
  raise Site::RedirectError
159
182
  end
@@ -193,10 +216,6 @@ class Site
193
216
  return html
194
217
  end
195
218
 
196
- def parse_content(html)
197
- return parse_noko(html)
198
- end
199
-
200
219
  def parse_noko(html)
201
220
  noko = Nokogiri::HTML(html)
202
221
  meta = noko.css("meta")
@@ -224,13 +243,9 @@ class Site
224
243
  end
225
244
  end
226
245
 
246
+ # Takes the old state file, and updates it with the values passed in hash
227
247
  def update_state_file(hash)
228
248
  previous_state = load_state_file()
229
- previous_state.update({
230
- "time" => Time.now.to_i,
231
- "url" => @url,
232
- "wait" => @wait
233
- })
234
249
  state = previous_state.update(hash)
235
250
  save_state_file(state)
236
251
  end
@@ -238,20 +253,10 @@ class Site
238
253
  def alert()
239
254
  logger.debug "Alerting new stuff"
240
255
  @alerters.each do |alerter|
241
- alerter.alert(self) unless @alert_only.include?(alerter.class::IDENTIFIER)
242
- end
243
- end
244
-
245
- def content()
246
- unless @did_stuff
247
- raise StandardError, 'Trying to access @content, but we have not pulled any data yet'
256
+ if @alert_only.empty? or @alert_only.include?(alerter.class::IDENTIFIER)
257
+ alerter.alert(self)
258
+ end
248
259
  end
249
-
250
- return @content
251
- end
252
-
253
- def get_content()
254
- return @html_content
255
260
  end
256
261
 
257
262
  def alert_only(alerter_identifiers)
@@ -262,14 +267,11 @@ class Site
262
267
  else
263
268
  raise StandardError, "unknown type of provided alerter identifier #{alerter_identifiers}"
264
269
  end
270
+ self
265
271
  end
266
272
 
267
- def should_update?(prevous_time)
268
- return Time.now().to_i >= prevous_time + @wait
269
- end
270
-
271
- def get_new(_previous_content = nil)
272
- @content = get_content()
273
+ # This method compares the previous stored content, with the new one, and returns what is new.
274
+ def get_diff()
273
275
  return @content
274
276
  end
275
277
 
@@ -279,81 +281,83 @@ class Site
279
281
  md5 = Digest::MD5.hexdigest(@url)
280
282
  @cache_dir = File.join(cache_dir, "cache-#{URI.parse(@url).hostname}-#{md5}")
281
283
  @state_file = File.join(last_dir, "last-#{URI.parse(@url).hostname}-#{md5}")
282
- state = load_state_file()
283
- @wait = @every || state["wait"] || 60 * 60
284
284
  @test = test
285
285
  logger.debug "using #{@state_file} to store updates, and #{@cache_dir} for Cache"
286
286
 
287
287
  do_stuff()
288
288
  rescue Site::RedirectError
289
289
  msg = "Error parsing page #{@url}, too many redirects"
290
- msg += ". Will retry in #{@wait} + 30 minutes"
290
+ msg += ". Will retry in #{@update_interval} + 30 minutes"
291
291
  logger.error msg
292
292
  warn msg
293
- update_state_file({ "wait" => @wait + 30 * 60 })
293
+ update_state_file({ "time" => Time.now.to_i, "wait_at_least" => @update_interval + 30 * 60 })
294
294
  rescue Site::ParseError => e
295
295
  msg = "Error parsing page #{@url}"
296
296
  if e.message
297
297
  msg += " with error : #{e.message}"
298
298
  end
299
- msg += ". Will retry in #{@wait} + 30 minutes"
299
+ msg += ". Will retry in #{@update_interval} + 30 minutes"
300
300
  logger.error msg
301
301
  warn msg
302
- update_state_file({ "wait" => @wait + 30 * 60 })
302
+ update_state_file({ "time" => Time.now.to_i, "wait_at_least" => @update_interval + 30 * 60 })
303
303
  rescue Errno::ECONNREFUSED, Net::ReadTimeout, OpenSSL::SSL::SSLError, Net::OpenTimeout => e
304
304
  msg = "Network error on #{@url}"
305
305
  if e.message
306
306
  msg += " : #{e.message}"
307
307
  end
308
- msg += ". Will retry in #{@wait} + 30 minutes"
308
+ msg += ". Will retry in #{@update_interval} + 30 minutes"
309
309
  logger.error msg
310
310
  warn msg
311
- update_state_file({ "wait" => @wait + 30 * 60 })
311
+ update_state_file({ "time" => Time.now.to_i, "wait_at_least" => @update_interval + 30 * 60 })
312
+ end
313
+
314
+ def extract_content()
315
+ return @website_html
312
316
  end
313
317
 
318
+ # By default, we pull html from the @url, we parse it with Nokogiri
314
319
  def pull_things()
315
- @html_content = fetch_url(@url)
316
- @parsed_content = parse_content(@html_content)
320
+ @website_html = fetch_url(@url)
321
+ @parsed_html = parse_noko(@website_html)
322
+ @content = extract_content()
317
323
  end
318
324
 
319
325
  def do_stuff()
320
- new_stuff = false
326
+ # Prepare previous_state, with defaults, that can be overriden with what we may find in the state_file
321
327
  previous_state = {
322
328
  "time" => -9_999_999_999_999,
323
329
  "content" => nil
324
330
  }
325
- state = load_state_file()
326
- if state
327
- previous_state.update(state)
331
+ old_state = load_state_file()
332
+ delay_between_updates = old_state["wait_at_least"] || @update_interval || 60
333
+ if old_state
334
+ previous_state.update(old_state)
328
335
  end
329
- previous_content = previous_state["content"]
330
- if should_update?(previous_state["time"]) or @test
336
+
337
+ if @test or (Time.now().to_i >= previous_state['time'] + delay_between_updates)
331
338
  if @rand_sleep > 0 and not @test
332
339
  logger.info "Time to update #{@url} (sleeping #{@rand_sleep} sec)"
333
340
  sleep(@rand_sleep)
334
341
  else
335
342
  logger.info "Time to update #{@url}"
336
343
  end
344
+
337
345
  pull_things()
338
- new_stuff = get_new(previous_content)
346
+
347
+ new_stuff = get_diff()
339
348
  @did_stuff = true
340
349
  if new_stuff
341
350
  if @test
342
351
  logger.info "Would have alerted with new stuff:\n#{new_stuff}"
343
352
  else
344
353
  alert()
345
- update_state_file({
346
- "content" => new_stuff,
347
- "previous_content" => previous_content
348
- })
349
354
  end
350
355
  else
351
356
  logger.info "Nothing new for #{@url}"
352
357
  if @test
353
- logger.info "Current state is still :\n#{@content}"
358
+ display_optional_state()
354
359
  end
355
360
  end
356
- update_state_file({}) unless @test
357
361
  else
358
362
  @did_stuff = true
359
363
  logger.info "Too soon to update #{@url}"
@@ -397,15 +401,18 @@ class Site
397
401
  end
398
402
  end
399
403
 
400
- def get_new(previous_content = nil)
401
- # Is a ResultObject
402
- if @content
403
- raise StandardError, "The result of get_content() should be a ResultObject if the Site class is SimpleString" unless @content.class < ResultObject
404
- else
405
- @content = get_content()
406
- end
404
+ def get_diff()
405
+ @content ||= extract_content()
406
+ previous_content = load_state_file()["content"]
407
407
  return nil if @content == previous_content
408
408
 
409
+ update_state_file(
410
+ {
411
+ "time" => Time.now.to_i,
412
+ "wait_at_least" => @update_interval,
413
+ "content" => @content
414
+ }
415
+ )
409
416
  return @content
410
417
  end
411
418
 
@@ -426,87 +433,118 @@ class Site
426
433
  end
427
434
  end
428
435
 
429
- class DiffString < SimpleString
430
- begin
431
- require "diffy"
432
-
433
- def generate_html_content()
434
- diff_html = Site::HTML_HEADER.dup
435
- diff_html += "<head><style>"
436
- diff_html += Diffy::CSS
437
- diff_html += "</style><body>"
438
- diff_html += @diffed.to_s(:html)
439
- diff_html += "</body></html>"
440
- return diff_html
441
- end
442
-
443
- def get_differ(previous, new)
444
- return Diffy::Diff.new(previous, new)
445
- end
446
- rescue LoadError
447
- require "test/unit/diff"
448
- def generate_html_content()
449
- diff_html = Site::HTML_HEADER.dup
450
- diff_html += @diffed.to_s
451
- diff_html += "</body></html>"
452
- return diff_html
453
- end
454
-
455
- def get_differ(previous, new)
456
- return new unless previous
457
-
458
- return Test::Unit::Diff.unified(previous, new)
459
- end
436
+ ## For use when you want to parse a site, and are only interested in having
437
+ # a nice looking "Diff" between the new and the previous state
438
+ # class DiffString < SimpleString
439
+ # begin
440
+ # require "diffy"
441
+ #
442
+ # def generate_html_content()
443
+ # diff_html = Site::HTML_HEADER.dup
444
+ # diff_html += "<head><style>"
445
+ # diff_html += Diffy::CSS
446
+ # diff_html += "</style><body>"
447
+ # diff_html += @diffed.to_s(:html)
448
+ # diff_html += "</body></html>"
449
+ # return diff_html
450
+ # end
451
+ #
452
+ # def get_differ(previous, new)
453
+ # return Diffy::Diff.new(previous, new)
454
+ # end
455
+ # rescue LoadError
456
+ # require "test/unit/diff"
457
+ # def generate_html_content()
458
+ # diff_html = Site::HTML_HEADER.dup
459
+ # diff_html += @diffed.to_s
460
+ # diff_html += "</body></html>"
461
+ # return diff_html
462
+ # end
463
+ #
464
+ # def get_differ(previous, new)
465
+ # return new unless previous
466
+ #
467
+ # return Test::Unit::Diff.unified(previous, new)
468
+ # end
469
+ # end
470
+ #
471
+ # def get_diff()
472
+ # new_stuff = nil
473
+ # @content = extract_content()
474
+ # unless @content
475
+ # return nil
476
+ # end
477
+ #
478
+ # if @content != previous_content
479
+ # @diffed = get_differ(previous_content, @content)
480
+ # new_stuff = @diffed.to_s
481
+ # end
482
+ # return new_stuff
483
+ # end
484
+ # end
485
+
486
+ ## For use when you want to parse a site that has Articles
487
+ # And you want to know when new, previously unseen Articles appear.
488
+ # For example, a shop.
489
+ #
490
+ # You need to make sure to call add_article() with instances of Article.
491
+ class Articles < Site
492
+ class Article < Hash
460
493
  end
461
494
 
462
- def get_new(previous_content = nil)
463
- new_stuff = nil
464
- @content = get_content()
465
- unless @content
466
- return nil
467
- end
495
+ def initialize
496
+ super
497
+ @articles = []
498
+ @found_articles = 0
499
+ end
468
500
 
469
- if @content != previous_content
470
- @diffed = get_differ(previous_content, @content)
471
- new_stuff = @diffed.to_s
472
- end
473
- return new_stuff
501
+ def content
502
+ log.error("Do not use site.content on an instance of Site::Articles in #{caller}")
503
+ return @articles
474
504
  end
475
- end
476
505
 
477
- class Articles < Site
478
- def initialize
479
- super
480
- @content = []
506
+ def display_optional_state
507
+ puts "We parsed the website and extracted #{@found_articles} articles"
481
508
  end
482
509
 
483
- def validate(item)
484
- raise StandardError, "Needs at least \"id\" key" unless item["id"]
510
+ def validate(article)
511
+ id = article['id']
512
+ raise StandardError, "Article needs an \"id\", which is used as identifier" unless id
485
513
 
486
- id = item["id"]
487
514
  raise StandardError, "\"id\" key needs to be a String and not #{id.class}" unless id.is_a?(String)
488
515
  end
489
516
 
490
- def add_article(item)
491
- logger.debug "Found article #{item['id']}"
492
- validate(item)
493
- item["_timestamp"] = Time.now().to_i
494
- @content << item unless @content.map { |x| x['id'] }.include?(item['id'])
517
+ def add_article(article)
518
+ logger.debug "Found article #{article['id']}"
519
+ @found_articles += 1
520
+ validate(article)
521
+ article['_timestamp'] = Time.now().to_i
522
+ @articles << article unless @articles.map { |art| art['id'] }.include?(article['id'])
523
+ end
524
+
525
+ def extract_articles()
526
+ raise StandardError, "Please implement extract_articles(). Use @parsed_html and call add_article()."
495
527
  end
496
528
 
497
- def get_new(previous_content)
498
- new_stuff = []
499
- get_content()
500
- unless @content
529
+ def get_diff()
530
+ extract_articles()
531
+ unless @articles
501
532
  return nil
502
533
  end
503
534
 
504
- if previous_content
505
- previous_ids = previous_content.map { |h| h["id"] }
506
- new_stuff = @content.delete_if { |item| previous_ids.include?(item["id"]) }
507
- else
508
- new_stuff = @content
535
+ new_stuff = @articles
536
+ previous_articles = load_state_file()["articles"]
537
+ if previous_articles
538
+ previous_ids = previous_articles.map { |art| art['id'] }
539
+ new_stuff = @articles.delete_if { |article| previous_ids.include?(article['id']) }
509
540
  end
541
+ update_state_file(
542
+ {
543
+ "time" => Time.now.to_i,
544
+ "wait_at_least" => @update_interval,
545
+ "articles" => (previous_articles || []).concat(@articles)
546
+ }
547
+ )
510
548
  if (not new_stuff) or new_stuff.empty?
511
549
  return nil
512
550
  end
@@ -514,37 +552,28 @@ class Site
514
552
  return new_stuff
515
553
  end
516
554
 
555
+ # Here we want to store every article we ever found
517
556
  def update_state_file(hash)
518
- hash_content = hash["content"]
519
- hash.delete("content")
520
557
  previous_state = load_state_file()
521
- previous_state.update({
522
- "time" => Time.now.to_i,
523
- "url" => @url,
524
- "wait" => @wait
525
- })
526
558
  state = previous_state.update(hash)
527
- if hash_content
528
- (previous_state["content"] ||= []).concat(hash_content)
529
- end
530
559
  save_state_file(state)
531
560
  end
532
561
 
533
562
  def generate_html_content()
534
563
  message_html = Site::HTML_HEADER.dup
535
564
  message_html << "<ul style='list-style-type: none;'>\n"
536
- @content.each do |item|
537
- msg = "<li id='#{item['id']}'>"
538
- if item["url"]
539
- msg += "<a href='#{item['url']}'>"
565
+ @articles.each do |article|
566
+ msg = "<li id='#{article['id']}'>"
567
+ if article['url']
568
+ msg += "<a href='#{article['url']}'>"
540
569
  end
541
- if item["img_src"]
542
- msg += "<img style='width:100px' src='#{item['img_src']}'/>"
570
+ if article["img_src"]
571
+ msg += "<img style='width:100px' src='#{article['img_src']}'/>"
543
572
  end
544
- if item["title"]
545
- msg += item['title'].to_s
573
+ if article["title"]
574
+ msg += article['title'].to_s
546
575
  end
547
- if item["url"]
576
+ if article["url"]
548
577
  msg += "</a>"
549
578
  end
550
579
  msg += "</li>\n"
@@ -556,16 +585,16 @@ class Site
556
585
 
557
586
  def generate_telegram_message_pieces()
558
587
  msg_pieces = []
559
- @content.each do |item|
560
- line = item["title"]
561
- if item["url"]
588
+ @articles.each do |article|
589
+ line = article["title"]
590
+ if article["url"]
562
591
  if line
563
- line += ": #{item['url']}"
592
+ line += ": #{article['url']}"
564
593
  else
565
- line = item["url"]
594
+ line = article["url"]
566
595
  end
567
596
 
568
- line += ": #{item['url']}"
597
+ line += ": #{article['url']}"
569
598
  end
570
599
  msg_pieces << line
571
600
  end
data/tests/helpers.rb CHANGED
@@ -27,6 +27,12 @@ class TestAlerter < Webwatchr::Alerting::Base
27
27
  end
28
28
 
29
29
  def alert(site)
30
- @result = site.content
30
+ if site.is_a?(Site::Articles)
31
+ @result = site.articles
32
+ elsif site.is_a?(Site::SimpleString)
33
+ @result = site.content
34
+ else
35
+ raise StandardError, "Unknown Site class being tests: #{site.class}"
36
+ end
31
37
  end
32
38
  end
data/tests/infra_test.rb CHANGED
@@ -45,6 +45,10 @@ class BaseWebrickTest < Test::Unit::TestCase
45
45
  restart_webrick()
46
46
  end
47
47
 
48
+ def cleanup
49
+ FileUtils.remove_entry_secure(@workdir) if File.directory?(@workdir)
50
+ end
51
+
48
52
  def teardown()
49
53
  @webrick.stop
50
54
  @serv_thread.join
@@ -53,7 +57,6 @@ class BaseWebrickTest < Test::Unit::TestCase
53
57
  f.puts ""
54
58
  end
55
59
  end
56
- FileUtils.remove_entry_secure(@workdir)
57
60
  end
58
61
  end
59
62
 
@@ -61,11 +64,11 @@ class TestSimpleStringSite < BaseWebrickTest
61
64
  class TestStringSite < Site::SimpleString
62
65
  def initialize
63
66
  super()
64
- @wait = 3600
67
+ @update_interval = 200
65
68
  end
66
69
 
67
- def get_content()
68
- return ResultObject.new(@parsed_content.css("div.content").text)
70
+ def extract_content()
71
+ return ResultObject.new(@parsed_html.css("div.content").text)
69
72
  end
70
73
  end
71
74
 
@@ -77,31 +80,27 @@ class TestSimpleStringSite < BaseWebrickTest
77
80
  f.write whole_html
78
81
  end
79
82
  url = "http://localhost:#{TEST_CONFIG[:wwwport]}/#{TEST_CONFIG[:content_is_string_file]}"
80
- wait = 10 * 60
81
83
 
82
84
  c = TestStringSite.new
83
85
  c.url = url
84
86
  a = TestAlerter.new()
85
87
  c.alerters = [a]
86
88
  assert { c.load_state_file() == {} }
87
- assert { c.should_update?(-9_999_999_999_999) }
88
- assert { c.should_update?((Time.now() - wait + 30).to_i) == false }
89
89
  html = c.fetch_url(url)
90
90
  assert { whole_html == html }
91
91
  assert { c.parse_noko(html).css("title").text == "test" }
92
92
  cache_dir = File.join(@workdir, "cache")
93
93
  last_dir = File.join(@workdir, ".lasts")
94
+ c.state_file = File.join(last_dir, "last-localhost-2182cd5c8685baed48f692ed72d7a89f")
94
95
  FileUtils.mkdir_p(cache_dir)
95
96
  FileUtils.mkdir_p(last_dir)
96
97
  c.update(cache_dir: cache_dir, last_dir: last_dir)
97
- assert { c.state_file.end_with?("last-localhost-2182cd5c8685baed48f692ed72d7a89f") }
98
98
  expected_error = "DEBUG -- TestSimpleStringSite::TestStringSite: Alerting new stuff"
99
99
  last_error = @logger_test_io.string.split("\n")[-1]
100
100
  assert { last_error.end_with?(expected_error) }
101
101
  first_pass_content = Site::HTML_HEADER + content_html
102
- assert { c.content.to_html == content_html }
103
102
  assert { c.generate_html_content == first_pass_content }
104
- assert { a.result == c.content }
103
+ assert { a.result.message == c.content.message }
105
104
 
106
105
  File.open(File.join(TEST_CONFIG[:wwwroot], TEST_CONFIG[:content_is_string_file]), "w+") do |f|
107
106
  f.write whole_html.gsub("</div>", " new ! </div>")
@@ -121,7 +120,7 @@ class TestSimpleStringSite < BaseWebrickTest
121
120
  assert { c.generate_html_content.nil? }
122
121
  assert { c.name == url }
123
122
 
124
- c.every = 0
123
+ c.update_state_file({ "time" => Time.now.to_i - 300 })
125
124
  c.update(cache_dir: cache_dir, last_dir: last_dir)
126
125
  expected_error = "DEBUG -- TestSimpleStringSite::TestStringSite: Alerting new stuff"
127
126
  last_error = @logger_test_io.string.split("\n")[-1]
@@ -131,9 +130,10 @@ class TestSimpleStringSite < BaseWebrickTest
131
130
  assert { c.name == url }
132
131
  result_last = JSON.parse(File.read(c.state_file), create_additions: true)
133
132
  result_last.delete("time")
134
- assert { result_last["url"] == url }
135
133
  assert { result_last["content"].message == "#{content_html} new ! " }
136
- assert { result_last["wait"] == 0 }
134
+ assert { result_last["wait_at_least"] == 200 }
135
+ ensure
136
+ cleanup
137
137
  end
138
138
  end
139
139
 
@@ -141,14 +141,14 @@ class TestArraySites < BaseWebrickTest
141
141
  class TestArraySite < Site::Articles
142
142
  def initialize
143
143
  super()
144
- @wait = 3600
144
+ @update_interval = 200
145
145
  end
146
146
 
147
- def get_content()
147
+ def extract_articles()
148
148
  res = []
149
- @parsed_content.css("div").each do |x|
149
+ @parsed_html.css("div").each do |x|
150
150
  a, b = x.text.split("-").map(&:strip)
151
- add_article({ "id" => a, "url" => a, "title" => b })
151
+ add_article(Article["id" => a, "url" => a, "title" => b])
152
152
  end
153
153
  return res
154
154
  end
@@ -160,15 +160,12 @@ class TestArraySites < BaseWebrickTest
160
160
  f.write whole_html
161
161
  end
162
162
  url = "http://localhost:#{TEST_CONFIG[:wwwport]}/#{TEST_CONFIG[:content_is_array_file]}"
163
- wait = 10 * 60
164
163
 
165
164
  c = TestArraySite.new
166
165
  c.url = url
167
166
  a = TestAlerter.new()
168
167
  c.alerters = [a]
169
168
  assert { c.load_state_file() == {} }
170
- assert { c.should_update?(-9_999_999_999_999) }
171
- assert { !c.should_update?((Time.now() - wait + 30).to_i) }
172
169
  html = c.fetch_url(url)
173
170
  assert { html == whole_html }
174
171
  assert { c.parse_noko(html).css("title").text == "test" }
@@ -189,17 +186,15 @@ class TestArraySites < BaseWebrickTest
189
186
  "<li id='fi'><a href='fi'>fu</a></li>",
190
187
  "</ul>"
191
188
  ].join("\n")
192
- c.content.each { |x| x.delete('_timestamp') }
189
+ c.articles.each { |x| x.delete('_timestamp') }
193
190
  assert {
194
- c.content == [
191
+ c.articles == [
195
192
  { "id" => "lol", "url" => "lol", "title" => "lilo" },
196
193
  { "id" => "fi", "url" => "fi", "title" => "fu" }
197
194
  ]
198
195
  }
199
196
  assert { c.generate_html_content == expected_html }
200
197
 
201
- result = ""
202
-
203
198
  File.open(File.join(TEST_CONFIG[:wwwroot], TEST_CONFIG[:content_is_array_file]), "a+") do |f|
204
199
  f.write "<div>new! - new </div>"
205
200
  end
@@ -207,46 +202,39 @@ class TestArraySites < BaseWebrickTest
207
202
  c.url = url
208
203
  a = TestAlerter.new()
209
204
  c.alerters = [a]
210
- # Second run don't d anything because we shouldn't rerun
205
+ # Second run don't do anything because we shouldn't rerun
211
206
  c.update(cache_dir: cache_dir, last_dir: last_dir)
212
207
  expected_error = "INFO -- TestArraySites::TestArraySite: Too soon to update #{url}"
213
208
  last_error = @logger_test_io.string.split("\n")[-1]
214
209
  assert { last_error.end_with?(expected_error) }
215
- assert { result == "" }
216
210
 
217
- result = ""
218
-
219
- c.content.each { |x| x.delete('_timestamp') }
211
+ c.update_state_file({ "time" => Time.now.to_i - 300 })
220
212
 
221
- c.every = 0
222
213
  # This time we set new things, and wait is 0 so we are good to go
223
214
  c.update(cache_dir: cache_dir, last_dir: last_dir)
224
215
  expected_error = "DEBUG -- TestArraySites::TestArraySite: Alerting new stuff"
225
216
  last_error = @logger_test_io.string.split("\n")[-1]
226
217
  assert { last_error.end_with?(expected_error) }
218
+
227
219
  expected_html = Site::HTML_HEADER.dup + [
228
220
  "<ul style='list-style-type: none;'>",
229
221
  "<li id='new!'><a href='new!'>new</a></li>",
230
222
  "</ul>"
231
223
  ].join("\n")
232
224
 
233
- c.content.each { |x| x.delete('_timestamp') }
234
- assert { c.content == [{ "id" => "new!", "url" => "new!", "title" => "new" }] }
225
+ c.articles.each { |x| x.delete('_timestamp') }
226
+ assert { c.articles == [{ "id" => "new!", "url" => "new!", "title" => "new" }] }
235
227
  assert { c.generate_html_content == expected_html }
236
- expected_last = { "url" => "http://localhost:#{TEST_CONFIG[:wwwport]}/#{TEST_CONFIG[:content_is_array_file]}",
237
- "previous_content" => [{ "id" => "lol", "url" => "lol", "title" => "lilo" },
238
- { "id" => "fi", "url" => "fi", "title" => "fu" }],
239
- "wait" => 0,
240
- "content" => [{ "id" => "lol", "title" => "lilo", "url" => "lol" },
241
- { "id" => "fi", "title" => "fu", "url" => "fi" },
242
- { "id" => "new!", "title" => "new", "url" => "new!" }] }
228
+ expected_last = {
229
+ "wait_at_least" => 200,
230
+ "articles" => [{ "id" => "lol", "title" => "lilo", "url" => "lol" },
231
+ { "id" => "fi", "title" => "fu", "url" => "fi" },
232
+ { "id" => "new!", "title" => "new", "url" => "new!" }]
233
+ }
243
234
  result_last = JSON.parse(File.read(c.state_file))
244
235
  result_last.delete("time")
245
- result_last["content"].each do |item|
246
- item.delete("_timestamp")
247
- end
248
- result_last["previous_content"].each do |item|
249
- item.delete("_timestamp")
236
+ result_last["articles"].each do |article|
237
+ article.delete("_timestamp")
250
238
  end
251
239
  assert { expected_last == result_last }
252
240
 
@@ -256,8 +244,9 @@ class TestArraySites < BaseWebrickTest
256
244
  c.url = url
257
245
  a = TestAlerter.new()
258
246
  c.alerters = [a]
259
- c.every = 0
260
- # Now, we don't call the alert Proc because we have no new things
247
+ # Now, we don't call the alerters because we have no new things
248
+ c.state_file = File.join(last_dir, "last-localhost-35e711989b197f20f3d4936e91a2c079")
249
+ c.update_state_file({ "time" => Time.now.to_i - 300 })
261
250
  c.update(cache_dir: cache_dir, last_dir: last_dir)
262
251
  expected_error = "INFO -- TestArraySites::TestArraySite: Nothing new for #{url}"
263
252
  last_error = @logger_test_io.string.split("\n")[-1]
@@ -267,5 +256,7 @@ class TestArraySites < BaseWebrickTest
267
256
  "</ul>"
268
257
  ].join("\n")
269
258
  assert { result == "" }
259
+ ensure
260
+ cleanup
270
261
  end
271
262
  end
data/webwatchr.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "webwatchr"
3
- s.version = "0.0.2"
3
+ s.version = "0.0.3"
4
4
  s.summary = "Scrapes stuff and tells you of updates."
5
5
  s.description = "Scrapes stuff and tells you of updates. Exciting!"
6
6
  s.authors = ["Renzo"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webwatchr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Renzo
@@ -22,6 +22,7 @@ files:
22
22
  - lib/sites/bandcamp.rb
23
23
  - lib/sites/bsky.rb
24
24
  - lib/sites/postch.rb
25
+ - lib/sites/postnl.rb
25
26
  - lib/sites/songkick.rb
26
27
  - lib/webwatchr.rb
27
28
  - lib/webwatchr/alerting.rb