webwatchr 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,122 @@
1
+ # WebWatchr
2
+
3
+ Silly script to periodically check webpage changes.
4
+
5
+ 1. run script every minute from cron
6
+ 2. pulls data for every Website to check, if the last time we did that is long ago
7
+ 3. if the content differs from the last time, alerts you with the new content (email, telegram)
8
+
9
+ # Installation
10
+
11
+ ```shell
12
+
13
+ $ gem install webwatchr
14
+
15
+ # if you want fancier Diffs, for DiffString objects, apt install ruby-diffy
16
+ ```
17
+
18
+ And then make your own `dsl.rb` script. Example:
19
+
20
+ ```ruby
21
+ require "webwatchr"
22
+
23
+ class SomeSimpleSite < Site::SimpleString
24
+ def initialize()
25
+ @url = "https://somesimplesite.com/shops"
26
+ super()
27
+ end
28
+
29
+ # Implement this function, to return what you want to compare every run
30
+ def get_content
31
+ res = ""
32
+ @parsed_content.css("div.shop-main a").map do |a|
33
+ url = "https://somesimplesite.com/shop/#{a['href']}"
34
+ if a.css('img')[0]['src'] == "soldout.png"
35
+ next
36
+ end
37
+
38
+ res << "#{url}\n"
39
+ end
40
+ res == "" ? nil : res
41
+ end
42
+ end
43
+
44
+ Webwatchr::Main.new do
45
+ # Some configuration, first for the alerting
46
+
47
+ # Send emails
48
+ add_default_alert :email do
49
+ set :smtp_port, 25
50
+ set :smtp_server, "localhost"
51
+ set :from_addr, "webwatchr@domain.eu"
52
+ set :dest_addr, "admin@domain.eu"
53
+ end
54
+
55
+ # Use telegram bot to send you messages
56
+ add_default_alert :telegram do
57
+ set :token, "12345:LONGTOKEN09876543"
58
+ set :chat_id, 1234567890
59
+ end
60
+
61
+ # # Just outputs update to the terminal
62
+ # add_default_alert :stdout
63
+
64
+ update BskySearch do
65
+ set "username", "toto"
66
+ set "password", "toto"
67
+ keyword "#danemark"
68
+ end
69
+
70
+ update SomeSimpleSite
71
+
72
+ update PostCH do
73
+ track_id "LS123476US"
74
+ end
75
+ end
76
+ ```
77
+
78
+ Run the cron often:
79
+
80
+ ```
81
+ */5 * * * * cd /home/poil/my_fav_scripts/; ruby dsl.rb
82
+ ```
83
+
84
+ # Supported websites
85
+
86
+ The sites that are somewhat maintained are [listed here](https://github.com/conchyliculture/webwatchr).
87
+
88
+ Some examples:
89
+
90
+ * Bluesky
91
+ * Bandcamp merch pages
92
+ * Package tracking (DHL, Colissimo, i-parcel, Royalmail, PostNL, UPS, USPS, etc.)
93
+
94
+
95
+ ## Force a site check, ignoring the 'wait' parameter
96
+
97
+ This can be useful to run a site update at a specific time/day with a crontab, instead of every specified amount of time. You can force update a website using the -s flag:
98
+ ```bash
99
+ ruby webwatchr.rb -t -s SiteClass
100
+ ```
101
+
102
+ # FAQ
103
+ ## Tests?
104
+
105
+ There are, like, two!
106
+
107
+ Run `rake`
108
+
109
+ ## Logs ?
110
+
111
+ Call `logger`, as you would a classic `Logger` object in your `mysite.rb`.
112
+
113
+ ## Alerting
114
+
115
+ Email is the main method of alerting, but you can also set webwatchr to talk to you on Telegram through a bot.
116
+
117
+ ### Telegram
118
+
119
+ First make a bot and grab a token following the [Telegram procedure](https://core.telegram.org/bots#6-botfather).
120
+
121
+ You also need to know the `chat_id` for its discussion with you. The code in [there](https://github.com/atipugin/telegram-bot-ruby/blob/master/examples/bot.rb) can help you.
122
+
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
# Default task: depends on the infrastructure tests and the per-site tests.
task default: %w[infra_tests sites_tests]

# Runs the infrastructure test file.
task :infra_tests do
  ruby "tests/infra_test.rb"
end

# Runs every Ruby file found directly under tests/sites/.
task :sites_tests do
  Dir.glob("tests/sites/*.rb").each { |test_file| ruby test_file }
end
@@ -0,0 +1,60 @@
1
+ require_relative "../webwatchr/site"
2
+
3
# Watches a band's Bandcamp merch page and records every item that is not
# marked as sold out as an article.
class BandcampMerch < Site::Articles
  require "net/http"
  require "nokogiri"

  # Notice found in the fetched HTML when it points at another page; the
  # capture group holds the target URL.
  REDIRECT_NOTICE = /You are being redirected, please follow <a href="([^"]+)"/
  # Image source for which the picture URL is read from the "data-original"
  # attribute instead (presumably a lazy-loading placeholder -- TODO confirm).
  PLACEHOLDER_IMAGE = %r{/img/43\.gif}

  # Sets the band to watch and derives the merch page URL from it.
  # Returns self.
  def band(value)
    @band = value
    @url = "https://#{@band}.bandcamp.com/merch"
    self
  end

  # Adds one article per item that is not sold out. Handles both the
  # "You are being redirected" notice (single item page) and the regular
  # merch grid.
  def get_content()
    if @html_content =~ REDIRECT_NOTICE
      add_redirected_item(::Regexp.last_match(1))
    else
      add_grid_items
    end
  end

  private

  # Fetches the page the redirect notice points at and adds its single item,
  # unless that item is flagged "Sold Out".
  def add_redirected_item(new_url)
    @html_content = Net::HTTP.get(URI.parse(new_url))
    @parsed_content = Nokogiri::HTML.parse(@html_content)
    item = @parsed_content.css('div#merch-item')
    if item.css(".notable").text == "Sold Out"
      logger.debug "That item is sold out =("
      return
    end

    title = item.css("h2.title").text
    img_url = item.css("img.main-art").attr("src").text
    add_article({
                  "id" => new_url,
                  "url" => new_url,
                  "img_src" => img_url,
                  "title" => title
                })
  end

  # Adds every entry of the merch grid that has no "sold-out" paragraph.
  def add_grid_items
    host = URI.parse(@url).host
    @parsed_content.css('ol.merch-grid li').each do |entry|
      unless entry.css('p.sold-out').empty?
        logger.debug "That item is sold out =("
        next
      end

      link = entry.css('a')
      # The scheme is kept as-is because the URL doubles as the article id.
      url = "http://#{host + link.attr('href').text}"
      img = link.css('img')
      img_url = img.attr('src').text
      img_url = img.attr('data-original')&.text if img_url =~ PLACEHOLDER_IMAGE
      title = link.css('p.title').text.strip.gsub(/ *\n */, '')
      price = link.css('span.price').text
      add_article({
                    "id" => url,
                    "url" => url,
                    "img_src" => img_url,
                    "title" => "#{title} #{price}"
                  })
    end
  end
end
data/lib/sites/bsky.rb ADDED
@@ -0,0 +1,176 @@
1
+ require_relative "../webwatchr/site"
2
+ require "mechanize"
3
+
4
# Shared helpers for the Bluesky sites: logging in, calling the XRPC API and
# turning a post into an article hash.
class BskyBase < Site::Articles
  API_PUBLIC_HOSTS = [
    'public.api.bsky.app'
  ].freeze
  API_PRIVATE_HOSTS = [
    'scalycap.us-west.host.bsky.network'
    # 'oysterling.us-west.host.bsky.network'
    # 'amanita.us-east.host.bsky.network'
  ].freeze

  # Logs in with @username / @password and returns the access token
  # ("accessJwt") from the createSession response.
  #
  # Raises Site::ParseError when the response cannot be parsed or carries no
  # token.
  def _get_bearer()
    url = "https://bsky.social/xrpc/com.atproto.server.createSession"
    data = {
      "identifier" => @username,
      "password" => @password,
      "authFactorToken" => "",
      "allowTakendown" => true
    }
    headers = {
      "content-type" => "application/json"
    }
    resp = @mechanize.post(url, data.to_json, headers)
    begin
      token = JSON.parse(resp.body)['accessJwt']
    rescue StandardError => e
      raise Site::ParseError, "Error while logging in #{e}"
    end
    raise Site::ParseError, "Error while logging in: no accessJwt in response" unless token

    token
  end

  # Sends a GET request for +path+ to a randomly picked API host: a private
  # host once a bearer token is set, a public one otherwise.
  # Returns the Mechanize response.
  def _api_get(path, params: [], headers: nil)
    api_hosts = @bearer ? API_PRIVATE_HOSTS : API_PUBLIC_HOSTS
    url = "https://#{api_hosts.sample}#{path}"
    # Never hand nil headers to Mechanize; an empty hash means "no extras".
    @mechanize.get(url, params, nil, headers || {})
  end

  # Resolves an account handle to its DID through the resolveHandle endpoint.
  def _profile_to_did(account)
    path = "/xrpc/com.atproto.identity.resolveHandle?handle=#{account}"
    JSON.parse(_api_get(path).body)['did']
  end

  # Builds the article hash ("id", "url", "title" and optionally "img_src")
  # for one post.
  def _article_from_post(post)
    post_id = post["uri"].split("/")[-1]
    text = post['record']['text']
    # Sites that watch no single account leave @account unset; fall back to
    # the post's author so the link still points at a profile.
    handle = @account || post.dig("author", "handle")
    art = {
      "id" => post["uri"],
      "url" => "https://bsky.app/profile/#{handle}/post/#{post_id}",
      "title" => "#{post['record']['createdAt']}: #{text}"
    }

    embed = post["embed"]
    if embed && embed["$type"] == "app.bsky.embed.images#view"
      images = embed["images"].sort_by { |image| (image["aspectRatio"] || { 'height' => 0 })["height"] }
      # With several images the second one in height order is used; with a
      # single image, that one. NOTE(review): the reason for picking the
      # second entry is not visible here -- confirm it is intended.
      chosen = images.size > 1 ? images[1] : images[0]
      art['img_src'] = chosen["thumb"] if chosen
    end

    art
  end
end
75
+
76
class BskyAccount < BskyBase
  # Reads the feed of a Bluesky account and adds every post as an article.
  #
  # ==== Examples
  #
  #      Webwatchr::Main.new do
  #          update BskyAccount do
  #              account "theonion.com"
  #              # Optional settings
  #              set "reposts", false # Disable reposts
  #              set "regex", /fun/ # Only posts where the text matches the regex
  #          end
  #         ....
  #      end

  attr_accessor :reposts, :regex

  # Sets the account handle to watch and the matching profile URL.
  # Returns self.
  def account(account)
    @account = account
    @url = "https://bsky.app/profile/#{account}"
    self
  end

  def initialize
    super()
    @reposts = true
    @mechanize = Mechanize.new()
    @json_results_key = "feed"
  end

  # Resolves the account to its DID, fetches its author feed (30 entries)
  # and stores the parsed JSON in @parsed_content.
  def pull_things
    did = _profile_to_did(@account)
    path = "/xrpc/app.bsky.feed.getAuthorFeed?actor=#{did}&filter=posts_and_author_threads&limit=30"
    @parsed_content = JSON.parse(_api_get(path).body)
  end

  # Adds one article per feed entry, skipping posts whose text does not match
  # @regex (when set) and, when reposts are disabled, posts whose author is
  # not the watched account.
  def get_content
    @parsed_content['feed'].each do |entry|
      post = entry['post']
      text = post['record']['text']
      next if @regex && (text !~ @regex)
      next if !@reposts && (post['author']['handle'] != @account)

      add_article(_article_from_post(post))
    end
  end
end
129
+
130
class BskySearch < BskyBase
  # Runs a search on Bluesky for a keyword
  #
  # ==== Examples
  #
  #      Webwatchr::Main.new do
  #          update BskySearch do
  #              keyword "#danemark"
  #              # Mandatory settings, you need to be logged in to search
  #              set "username", "username"
  #              set "password", "password"
  #          end
  #         ....
  #      end
  attr_accessor :username, :password

  # Sets the keyword to search for and the URL associated with this site.
  # Returns self.
  def keyword(keyword)
    @keyword = keyword
    # NOTE(review): this puts the raw keyword right after "?", so a keyword
    # such as "#danemark" becomes a URL fragment. Left unchanged because it is
    # not visible here whether the URL is also used as a storage key.
    @url = "https://bsky.app/search?#{keyword}"
    self
  end

  def initialize
    super()
    @mechanize = Mechanize.new()
    @json_results_key = "posts"
  end

  # Logs in if no bearer token is cached yet, runs the search for the
  # configured keyword (top 30 posts) and stores the parsed JSON in
  # @parsed_content.
  #
  # Raises StandardError when the credentials or the keyword are missing.
  def pull_things
    raise StandardError, 'Need both username & password to run searches' unless @password && @username
    raise StandardError, 'Need a keyword to run searches' unless @keyword

    @bearer ||= _get_bearer()
    headers = {
      "authorization" => "Bearer #{@bearer}"
    }

    # Search for the configured keyword rather than a fixed one.
    params = { "q" => @keyword, "limit" => 30, "sort" => "top" }
    resp = _api_get("/xrpc/app.bsky.feed.searchPosts", params: params.to_a, headers: headers)
    @parsed_content = JSON.parse(resp.body)
  end

  # Adds one article per post found by the search.
  def get_content
    @parsed_content['posts'].each do |post|
      add_article(_article_from_post(post))
    end
  end
end
@@ -0,0 +1,112 @@
1
+ require_relative "../webwatchr/site"
2
+ require "json"
3
+ require "mechanize"
4
+
5
# Tracks a Swiss Post shipment and reports its list of events as one string.
class PostCH < Site::SimpleString
  # DateTime is used to parse event timestamps.
  require "date"

  # Sets the Track ID & URL. Returns self.
  def track_id(track_id)
    @track_id = track_id
    @url = "https://www.post.ch/api/TrackAndTrace/Get?sc_lang=en&id=#{@track_id}"
    self
  end

  def initialize
    super()
    @events = []
    @mechanize = Mechanize.new()
    @mechanize.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:132.0) Gecko/20100101 Firefox/132.0'
    @text_messages = {}
  end

  # Translates an event code into its English message.
  #
  # The translation table is downloaded on first use. A table key matches when
  # it has exactly one more dot-separated component than +code+ and each of
  # its leading components is either "*" or equal to the corresponding
  # component of +code+. Returns +code+ itself when no key matches.
  def code_to_message(code)
    if @text_messages.nil? || @text_messages.empty?
      body = @mechanize.get("https://service.post.ch/ekp-web/core/rest/translations/en/shipment-text-messages").body
      @text_messages = JSON.parse(body)['shipment-text--'] || {}
    end

    code_parts = code.split('.')
    @text_messages.each do |pattern, message|
      pattern_parts = pattern.split('.')
      next unless pattern_parts.size == code_parts.size + 1

      matches = code_parts.each_with_index.all? do |part, i|
        pattern_parts[i] == "*" || pattern_parts[i] == part
      end
      return message if matches
    end
    code
  end

  # Fetches the shipment events and stores them, each with a "description",
  # in @parsed_content.
  #
  # Raises Site::ParseError when the tracking number yields no result.
  def pull_things()
    # First we need an anonymous userId
    resp = @mechanize.get("https://service.post.ch/ekp-web/api/user", nil, nil, { 'accept' => 'application/json' })
    user_id = JSON.parse(resp.body)['userIdentifier']
    csrf_token = resp.header["x-csrf-token"]

    headers = {
      'accept' => 'application/json, text/plain, */*',
      'Accept-Encoding' => 'gzip,deflate,br,zstd',
      'accept-language' => 'en',
      'Cache-Control' => 'no-cache',
      'Content-Type' => 'application/json',
      'Origin' => 'https://service.post.ch',
      'Referer' => 'https://service.post.ch',
      'X-Csrf-Token' => csrf_token
    }

    # Register the search; the response carries the hash used to fetch results.
    resp = @mechanize.post("https://service.post.ch/ekp-web/api/history?userId=#{user_id}", { 'searchQuery' => @track_id }.to_json, headers)
    hash = JSON.parse(resp.body)['hash']

    resp = @mechanize.get("https://service.post.ch/ekp-web/api/history/not-included/#{hash}?userId=#{user_id}", nil, nil, headers)
    jresp = JSON.parse(resp.body)
    if jresp.empty?
      raise Site::ParseError, "No result for #{@track_id}. Check that it is valid."
    end

    identity = jresp[0]['identity']

    resp = @mechanize.get("https://service.post.ch/ekp-web/api/shipment/id/#{identity}/events", nil, nil, headers)

    @parsed_content = JSON.parse(resp.body).map do |event|
      event['description'] = code_to_message(event['eventCode'])
      event
    end
  end

  # Renders the event messages built by get_content as an HTML list.
  def get_html_content()
    items = @events.map { |e| "<li>#{e}</li>" }
    [Site::HTML_HEADER, "<ul>", *items, "</ul>"].join("\n")
  end

  # Builds one message per event, newest first, and returns them joined by
  # newlines in a ResultObject.
  #
  # The parsed events are left untouched and @events is rebuilt on every
  # call, so calling this more than once gives the same result.
  def get_content()
    dated = @parsed_content.map do |event|
      [DateTime.strptime(event['timestamp'], "%Y-%m-%dT%H:%M:%S%Z"), event]
    end
    # Order by timestamp, then by description for events at the same instant.
    ordered = dated.sort_by { |timestamp, event| [timestamp, event['description']] }

    @events = ordered.reverse.map do |timestamp, event|
      msg = "#{timestamp}: #{event['description']}"
      if event['city'] && event['city'] != ""
        msg += " (#{event['city']} #{event['zip']})"
      end
      msg
    end

    ResultObject.new(@events.join("\n"))
  end
end
107
+
108
+ # Example:
109
+ #
110
+ # update PostCH do
111
+ # track_id "LS203038460CH"
112
+ # end
@@ -0,0 +1,28 @@
1
+ require_relative "../webwatchr/site"
2
+
3
# Watches a Songkick page and adds one article per listed concert.
class Songkick < Site::Articles
  # Sets the page URL to watch; warns when it does not end with "/calendar".
  # Returns self.
  def full_url(url)
    @url = url
    unless @url.end_with?("/calendar")
      logger.warn("Songkick should end with /calendar to get all concerts")
    end
    self
  end

  # Reads the JSON embedded in the first <script> of each entry of the first
  # "ol.event-listings" list and adds an article for it.
  #
  # Does nothing when the page has no such list, and skips entries without a
  # <script> element.
  def get_content()
    events = @parsed_content.css('ol.event-listings')[0]
    unless events
      logger.debug "No event listing found on #{@url}"
      return
    end

    events.css('li').each do |event|
      script = event.css('script')[0]
      next unless script

      j = JSON.parse(script.text)[0]
      loc = j["location"]
      location = "#{loc['name']} #{loc['address']['addressLocality']}, #{loc['address']['addressCountry']}"
      add_article({
                    "id" => j["url"],
                    "url" => j["url"],
                    "title" => "#{j['startDate']}: #{j['name']} at #{location}"
                  })
    end
  end
end
@@ -0,0 +1,129 @@
1
+ require "net/smtp"
2
+ require 'telegram/bot'
3
+ require_relative "./logger"
4
+
5
+ module Webwatchr
6
+ module Alerting
7
# Common behaviour of all alerts: a settings hash filled through #set and a
# check that every required setting is present.
class Base
  include Loggable
  # Settings that must be set before alerting; subclasses override this.
  REQUIRED_SETTINGS = [].freeze

  # Raises StandardError when a setting required by the concrete class has no
  # truthy value.
  def validate
    # Resolve the constant through the instance's class: a bare
    # REQUIRED_SETTINGS here would always be this class's empty list.
    required = self.class::REQUIRED_SETTINGS
    missing_settings = required.reject { |key| @config[key] }
    raise StandardError, "Missing required settings for #{self.class}: #{missing_settings}" unless missing_settings.empty?
  end

  # Builds an alert, evaluating the optional block in the context of the new
  # instance, and always returns that instance.
  def self.create(&block)
    instance = new
    instance.instance_eval(&block) if block
    instance
  end

  # Checks the arguments and settings; subclasses call this first and then
  # deliver the update.
  #
  # Raises StandardError when +site+ is missing or a required setting is
  # absent.
  def alert(site)
    raise StandardError, "Need to pass a Site instance" unless site

    validate
  end

  def initialize
    @config = {}
  end

  # Stores one setting. Returns self.
  def set(key, val)
    @config[key] = val
    self
  end
end
38
+
39
class EmailAlert < Base
  REQUIRED_SETTINGS = %i[from_addr dest_addr smtp_server smtp_port].freeze
  IDENTIFIER = :email
  # This class will send you email if content changes.
  #
  # ==== Examples
  #
  #      Webwatchr::Main.new do
  #          add_default_alert :email do
  #              set :smtp_port, 25
  #              set :smtp_server, "localhost"
  #              set :dest_addr, "dest@email.eu"
  #              set :from_addr, "source@email.eu"
  #          end
  #         ....
  #      end

  # Sends the site's HTML content by email to the configured address.
  #
  # Raises StandardError when no from address is configured. A fatal SMTP
  # error is logged and not re-raised.
  def alert(site)
    super(site)
    raise StandardError, "from address cannot be nil" unless @config[:from_addr]

    # The subject ends up in the header block, so it must stay on one line.
    subject = (site.get_email_subject() || "Update from #{site.class}").to_s.gsub(/[\r\n]+/, " ")

    formatted_content = site.generate_html_content()

    msgstr = <<~END_OF_MESSAGE
      From: #{@config[:from_addr]}
      To: #{@config[:dest_addr]}
      MIME-Version: 1.0
      Content-type: text/html; charset=UTF-8
      Subject: [Webwatchr] #{subject}

      Update from #{site.get_email_url()}

      #{formatted_content}
    END_OF_MESSAGE

    begin
      Net::SMTP.start(@config[:smtp_server], @config[:smtp_port], starttls: false) do |smtp|
        smtp.send_message(msgstr, @config[:from_addr], @config[:dest_addr])
        logger.debug("Sending mail to #{@config[:dest_addr]}")
      end
    rescue Net::SMTPFatalError => e
      logger.error "Couldn't send email from #{@config[:from_addr]} to #{@config[:dest_addr]}. #{@config[:smtp_server]}:#{@config[:smtp_port]} said #{e.message}"
    end
  end
end
86
+
87
class TelegramAlert < Base
  IDENTIFIER = :telegram
  REQUIRED_SETTINGS = %i[token chat_id].freeze
  # Size budget for one message, kept below Telegram's 4096 character limit.
  CHUNK_SIZE = 4000
  # This class will use a Telegram bot to send you a message if content changes.
  #
  # ==== Examples
  #
  #      Webwatchr::Main.new do
  #         ...
  #          add_default_alert :telegram do
  #              set :token, "95123456YU:AArestoftoken"
  #              set :chat_id, 123456789
  #          end
  #         ....
  #      end

  # Sends the subject, the URL and the site's message pieces to the
  # configured chat, split into as many messages as needed.
  def alert(site)
    super(site)
    bot = Telegram::Bot::Client.new(@config[:token])
    # Same fallback as the email alert when the site provides no subject.
    msg_pieces = [site.get_email_subject || "Update from #{site.class}"]
    msg_pieces << site.get_email_url()
    msg_pieces += site.generate_telegram_message_pieces()

    split_messages(msg_pieces).each do |m|
      bot.api.send_message(chat_id: @config[:chat_id], text: m)
    end
  end

  private

  # Packs the pieces, one per line, into messages within the size budget.
  # Over-long pieces are split at newlines, and any line that is still too
  # long is cut into fixed-size slices. Empty messages are dropped.
  def split_messages(pieces)
    lines = pieces.map { |piece| piece.size > CHUNK_SIZE ? piece.split("\n") : piece }.flatten
    lines = lines.flat_map { |line| line.size > CHUNK_SIZE ? line.scan(/.{1,#{CHUNK_SIZE}}/m) : [line] }

    messages = lines.each_with_object([+'']) do |line, acc|
      if acc.last.length + line.length > CHUNK_SIZE
        acc << "#{line}\n"
      else
        acc.last << "#{line}\n"
      end
    end
    messages.reject(&:empty?)
  end
end
119
+
120
# Prints the update to standard output instead of sending it anywhere.
class StdoutAlert < Base
  IDENTIFIER = :stdout

  # Writes the site's URL followed by its generated HTML content to stdout.
  def alert(site)
    super(site)
    puts("Update from #{site.url}\n#{site.generate_html_content}")
  end
end
128
+ end
129
+ end