heathrow 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +58 -0
  3. data/README.md +205 -0
  4. data/bin/heathrow +42 -0
  5. data/bin/heathrowd +283 -0
  6. data/docs/ARCHITECTURE.md +1172 -0
  7. data/docs/DATABASE_SCHEMA.md +685 -0
  8. data/docs/DEVELOPMENT_WORKFLOW.md +867 -0
  9. data/docs/DISCORD_SETUP.md +142 -0
  10. data/docs/GMAIL_OAUTH_SETUP.md +120 -0
  11. data/docs/PLUGIN_SYSTEM.md +1370 -0
  12. data/docs/PROJECT_PLAN.md +1022 -0
  13. data/docs/README.md +417 -0
  14. data/docs/REDDIT_SETUP.md +174 -0
  15. data/docs/REPLY_FORWARD.md +182 -0
  16. data/docs/WHATSAPP_TELEGRAM_SETUP.md +306 -0
  17. data/heathrow.gemspec +34 -0
  18. data/heathrowd.service +21 -0
  19. data/img/heathrow.svg +95 -0
  20. data/img/rss_threaded.png +0 -0
  21. data/img/sources.png +0 -0
  22. data/lib/heathrow/address_book.rb +42 -0
  23. data/lib/heathrow/config.rb +332 -0
  24. data/lib/heathrow/database.rb +731 -0
  25. data/lib/heathrow/database_new.rb +392 -0
  26. data/lib/heathrow/event_bus.rb +175 -0
  27. data/lib/heathrow/logger.rb +122 -0
  28. data/lib/heathrow/message.rb +176 -0
  29. data/lib/heathrow/message_composer.rb +399 -0
  30. data/lib/heathrow/message_organizer.rb +774 -0
  31. data/lib/heathrow/migrations/001_initial_schema.rb +248 -0
  32. data/lib/heathrow/notmuch.rb +45 -0
  33. data/lib/heathrow/oauth2_smtp.rb +254 -0
  34. data/lib/heathrow/plugin/base.rb +212 -0
  35. data/lib/heathrow/plugin_manager.rb +141 -0
  36. data/lib/heathrow/poller.rb +93 -0
  37. data/lib/heathrow/smtp_sender.rb +204 -0
  38. data/lib/heathrow/source.rb +39 -0
  39. data/lib/heathrow/sources/base.rb +74 -0
  40. data/lib/heathrow/sources/discord.rb +357 -0
  41. data/lib/heathrow/sources/gmail.rb +294 -0
  42. data/lib/heathrow/sources/imap.rb +198 -0
  43. data/lib/heathrow/sources/instagram.rb +307 -0
  44. data/lib/heathrow/sources/instagram_fetch.py +101 -0
  45. data/lib/heathrow/sources/instagram_send.py +55 -0
  46. data/lib/heathrow/sources/instagram_send_marionette.py +104 -0
  47. data/lib/heathrow/sources/maildir.rb +606 -0
  48. data/lib/heathrow/sources/messenger.rb +212 -0
  49. data/lib/heathrow/sources/messenger_fetch.js +297 -0
  50. data/lib/heathrow/sources/messenger_fetch_marionette.py +138 -0
  51. data/lib/heathrow/sources/messenger_send.js +32 -0
  52. data/lib/heathrow/sources/messenger_send.py +100 -0
  53. data/lib/heathrow/sources/reddit.rb +461 -0
  54. data/lib/heathrow/sources/rss.rb +299 -0
  55. data/lib/heathrow/sources/slack.rb +375 -0
  56. data/lib/heathrow/sources/source_manager.rb +328 -0
  57. data/lib/heathrow/sources/telegram.rb +498 -0
  58. data/lib/heathrow/sources/webpage.rb +207 -0
  59. data/lib/heathrow/sources/weechat.rb +479 -0
  60. data/lib/heathrow/sources/whatsapp.rb +474 -0
  61. data/lib/heathrow/ui/application.rb +8098 -0
  62. data/lib/heathrow/ui/navigation.rb +8 -0
  63. data/lib/heathrow/ui/panes.rb +8 -0
  64. data/lib/heathrow/ui/source_wizard.rb +567 -0
  65. data/lib/heathrow/ui/threaded_view.rb +780 -0
  66. data/lib/heathrow/ui/views.rb +8 -0
  67. data/lib/heathrow/version.rb +3 -0
  68. data/lib/heathrow/wizards/discord_wizard.rb +193 -0
  69. data/lib/heathrow/wizards/slack_wizard.rb +140 -0
  70. data/lib/heathrow.rb +55 -0
  71. metadata +147 -0
@@ -0,0 +1,498 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'net/http'
5
+ require 'json'
6
+ require 'uri'
7
+ require 'time'
8
+
9
+ module Heathrow
10
+ module Sources
11
+ class Telegram
12
+ attr_reader :source, :last_fetch_time
13
+
14
# Builds a Telegram adapter around a stored source record.
#
# @param source [#config] record whose +config+ is a Hash, a JSON-encoded
#   String, or nil. The record is kept in @source for later use.
# @raise [JSON::ParserError] when a String config is not valid JSON
def initialize(source)
  @source = source

  raw_config = source.config
  # A String config is decoded from JSON; a Hash is used as-is. A missing
  # config becomes an empty Hash so that later credential lookups such as
  # @config['bot_token'] return nil instead of raising NoMethodError.
  @config = raw_config.is_a?(String) ? JSON.parse(raw_config) : (raw_config || {})

  @last_fetch_time = Time.now
  @last_message_id = nil
end
20
+
21
# Fetches messages through whichever credential set is configured.
#
# A bot token takes priority; otherwise api_id + api_hash select the MTProto
# path. Any StandardError raised while fetching is swallowed (and printed
# only when ENV['DEBUG'] is set), in which case an empty Array is returned.
#
# @return [Array] the fetched messages, or [] when nothing is configured
#   or the fetch failed
def fetch_messages
  if @config['bot_token']
    fetch_bot_messages
  elsif @config['api_id'] && @config['api_hash']
    fetch_mtproto_messages
  else
    puts "Telegram: No valid credentials configured" if ENV['DEBUG']
    []
  end
rescue => e
  puts "Telegram fetch error: #{e.message}" if ENV['DEBUG']
  puts e.backtrace.join("\n") if ENV['DEBUG']
  []
end
41
+
42
# Checks connectivity using the configured credentials.
#
# @return [Hash] { success: Boolean, message: String }. Any StandardError
#   raised by the underlying check is converted into a failure result.
def test_connection
  return test_bot_connection if @config['bot_token']
  return test_mtproto_connection if @config['api_id'] && @config['api_hash']

  { success: false, message: "No Telegram credentials configured" }
rescue => e
  { success: false, message: "Connection test failed: #{e.message}" }
end
55
+
56
# Starts interactive user-account authentication when api_id, api_hash and
# phone_number are all configured; otherwise prints a configuration hint.
#
# @return [Object] the result of authenticate_mtproto, or false when the
#   required settings are missing
def authenticate
  user_account_ready = %w[api_id api_hash phone_number].all? { |key| @config[key] }
  return authenticate_mtproto if user_account_ready

  puts "For user account access, configure api_id, api_hash, and phone_number"
  puts "For bot access, configure bot_token"
  false
end
65
+
66
# Reports whether this source supports sending replies.
#
# @return [Boolean] always true
def can_reply?
  true
end
69
+
70
# Sends a message through the bot API when a bot token is configured,
# otherwise through the MTProto proxy when a session string is present.
#
# @param to [Object] recipient, forwarded to the transport
# @param subject [Object] accepted for interface compatibility; not used
# @param body [String] message text
# @param in_reply_to [Object, nil] id of the message being replied to
# @return [Hash] { success: Boolean, message: String }
def send_message(to, subject, body, in_reply_to = nil)
  return send_bot_message(to, body, in_reply_to) if @config['bot_token']
  return send_mtproto_message(to, body, in_reply_to) if @config['session_string']

  { success: false, message: "Telegram not configured for sending" }
end
79
+
80
+ private
81
+
82
# Sends a text message through the Telegram Bot API (sendMessage).
#
# Only numeric chat IDs are accepted; usernames are rejected because this
# method does not look up a chat ID for them.
#
# @param to [String, Integer] numeric chat ID (may be negative)
# @param body [String] message text
# @param in_reply_to [Object, nil] message id to reply to, if any
# @return [Hash] { success: Boolean, message: String }
def send_bot_message(to, body, in_reply_to = nil)
  # Coerce first so Integer chat IDs work, and anchor with \A/\z so a
  # multi-line string cannot pass just because one of its lines is numeric.
  chat_id = to.to_s
  unless chat_id.match?(/\A-?\d+\z/)
    return { success: false, message: "Please use chat ID for Telegram messages" }
  end

  payload = { chat_id: chat_id, text: body }
  payload[:reply_to_message_id] = in_reply_to if in_reply_to

  uri = URI("https://api.telegram.org/bot#{@config['bot_token']}/sendMessage")
  request = Net::HTTP::Post.new(uri)
  request['Content-Type'] = 'application/json'
  request.body = payload.to_json

  # Explicit timeouts keep a stalled connection from blocking the caller.
  response = Net::HTTP.start(uri.hostname, uri.port,
                             use_ssl: true, open_timeout: 10, read_timeout: 30) do |http|
    http.request(request)
  end

  return { success: false, message: "HTTP error: #{response.code}" } unless response.is_a?(Net::HTTPSuccess)

  data = JSON.parse(response.body)
  if data['ok']
    { success: true, message: "Message sent via Telegram bot" }
  else
    { success: false, message: "Failed: #{data['description']}" }
  end
rescue => e
  # Covers network failures, malformed URIs and unparsable responses.
  { success: false, message: "Send failed: #{e.message}" }
end
127
+
128
# Sends a message through the MTProto proxy server's /send endpoint.
#
# The proxy base URL comes from @config['mtproto_api_url'] and defaults to
# http://localhost:8081.
#
# @param to [Object] chat identifier, passed through as chat_id
# @param body [String] message text
# @param in_reply_to [Object, nil] id of the message being replied to
# @return [Hash] { success: Boolean, message: String }
def send_mtproto_message(to, body, in_reply_to = nil)
  api_url = @config['mtproto_api_url'] || 'http://localhost:8081'

  # Built inside the method-level rescue so a malformed configured URL is
  # reported as a failure result instead of raising to the caller.
  uri = URI("#{api_url}/send")
  request = Net::HTTP::Post.new(uri)
  request['Content-Type'] = 'application/json'
  request.body = {
    session_string: @config['session_string'],
    chat_id: to,
    text: body,
    reply_to: in_reply_to
  }.to_json

  # Enable TLS when the proxy is configured with an https:// URL; without
  # this the request would be sent in plain text to the TLS port.
  response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
    http.request(request)
  end

  if response.is_a?(Net::HTTPSuccess)
    { success: true, message: "Message sent via Telegram" }
  else
    { success: false, message: "Failed to send via MTProto" }
  end
rescue => e
  { success: false, message: "MTProto server not available: #{e.message}" }
end
156
+
157
+ # Bot API Methods (simpler but limited to bot interactions)
158
+
159
# Polls the Bot API getUpdates endpoint and converts message updates.
#
# @last_message_id holds the highest update_id seen so far; the next poll
# asks for offset = that id + 1.
#
# @return [Array] converted messages (possibly empty)
def fetch_bot_messages
  uri = URI("https://api.telegram.org/bot#{@config['bot_token']}/getUpdates")
  params = { timeout: 0, limit: @config['fetch_limit'] || 100 }
  params[:offset] = @last_message_id + 1 if @last_message_id
  uri.query = URI.encode_www_form(params)

  response = Net::HTTP.get_response(uri)
  unless response.is_a?(Net::HTTPSuccess)
    puts "Telegram Bot API error: #{response.code}" if ENV['DEBUG']
    return []
  end

  data = JSON.parse(response.body)
  return [] unless data['ok'] && data['result']

  data['result'].each_with_object([]) do |update, messages|
    # Advance the offset for every update, not only those carrying a
    # 'message'. Otherwise other update kinds are returned again on each
    # poll and can fill the whole fetch window.
    @last_message_id = update['update_id'] if update['update_id']
    next unless update['message']

    begin
      converted = convert_bot_message(update['message'])
      messages << converted if converted
    rescue => e
      # One unconvertible update must not discard the rest of the batch.
      puts "Telegram: skipping update #{update['update_id']}: #{e.message}" if ENV['DEBUG']
    end
  end
end
192
+
193
# Verifies the bot token by calling the Bot API getMe endpoint.
#
# @return [Hash] { success: Boolean, message: String }
def test_bot_connection
  endpoint = URI("https://api.telegram.org/bot#{@config['bot_token']}/getMe")
  reply = Net::HTTP.get_response(endpoint)

  unless reply.is_a?(Net::HTTPSuccess)
    return { success: false, message: "Failed to connect to Telegram Bot API" }
  end

  payload = JSON.parse(reply.body)
  return { success: false, message: "Bot token invalid" } unless payload['ok']

  { success: true, message: "Connected as bot @#{payload['result']['username']}" }
end
210
+
211
# Converts a Bot API message object into the hash format used for storage.
#
# @param msg [Hash] decoded Bot API message (string keys)
# @return [Hash] message attributes; :attachments is whatever
#   extract_bot_attachments returns and :timestamp is an ISO 8601 string
def convert_bot_message(msg)
  chat = msg['chat']

  # 'from' may be missing from a message; fall back to the sending chat's
  # title instead of raising on nil.
  from = msg['from']
  sender =
    if from
      from['username'] || "#{from['first_name']} #{from['last_name']}".strip
    else
      msg.dig('sender_chat', 'title') || chat['title'] || 'Unknown'
    end

  recipient =
    case chat['type']
    when 'private' then 'Me (Bot)'
    when 'group', 'supergroup', 'channel' then chat['title']
    else 'Unknown'
    end

  content = msg['text'] || msg['caption'] || ''
  # Keep at most 50 characters and add the ellipsis only when something
  # was actually cut off.
  subject = content.length > 50 ? "#{content[0, 50]}..." : content

  {
    source_id: @source.id,
    source_type: 'telegram',
    external_id: "telegram_#{msg['message_id']}_#{chat['id']}",
    sender: sender,
    recipient: recipient,
    subject: subject,
    content: content,
    raw_data: msg.to_json,
    attachments: extract_bot_attachments(msg),
    timestamp: Time.at(msg['date']).iso8601,
    is_read: 0
  }
end
251
+
252
# Collects attachment descriptors from a Bot API message.
#
# Recognised keys: photo, video, document, voice, location, sticker.
#
# @param msg [Hash] decoded Bot API message (string keys)
# @return [String, nil] JSON array of attachment hashes, or nil when the
#   message carries none of the recognised attachments
def extract_bot_attachments(msg)
  attachments = []

  # 'photo' is a list of sizes of the same image. file_size may be absent,
  # so rank by size with pixel area as a tie-breaker, and skip an empty list.
  sizes = Array(msg['photo'])
  unless sizes.empty?
    largest = sizes.max_by do |size|
      [size['file_size'].to_i, size['width'].to_i * size['height'].to_i]
    end
    attachments << { type: 'photo', file_id: largest['file_id'] }
  end

  if (video = msg['video'])
    attachments << { type: 'video', file_id: video['file_id'], duration: video['duration'] }
  end

  if (document = msg['document'])
    attachments << {
      type: 'document',
      file_id: document['file_id'],
      file_name: document['file_name'],
      mime_type: document['mime_type']
    }
  end

  if (voice = msg['voice'])
    attachments << { type: 'voice', file_id: voice['file_id'], duration: voice['duration'] }
  end

  if (location = msg['location'])
    attachments << {
      type: 'location',
      latitude: location['latitude'],
      longitude: location['longitude']
    }
  end

  if (sticker = msg['sticker'])
    attachments << { type: 'sticker', file_id: sticker['file_id'], emoji: sticker['emoji'] }
  end

  attachments.empty? ? nil : attachments.to_json
end
309
+
310
+ # MTProto Methods (full user account access via proxy server)
311
+
312
# Fetches messages from the MTProto proxy server's /messages endpoint.
#
# When incremental sync is enabled, only messages newer than five minutes
# before the previous fetch are requested. @last_fetch_time is refreshed
# after a successful response.
#
# @return [Array] converted messages (possibly empty)
def fetch_mtproto_messages
  base_url = @config['mtproto_api_url'] || 'http://localhost:8081'

  query = {
    session_string: @config['session_string'],
    limit: @config['fetch_limit'] || 100
  }
  query[:since] = (@last_fetch_time - 300).iso8601 if @last_fetch_time && @config['incremental_sync']

  endpoint = URI("#{base_url}/messages")
  endpoint.query = URI.encode_www_form(query)
  reply = Net::HTTP.get_response(endpoint)

  unless reply.is_a?(Net::HTTPSuccess)
    puts "Telegram MTProto API error: #{reply.code}" if ENV['DEBUG']
    return []
  end

  payload = JSON.parse(reply.body)
  converted = (payload['messages'] || [])
              .map { |raw| convert_mtproto_message(raw) }
              .select { |message| message }

  @last_fetch_time = Time.now
  converted
end
350
+
351
# Checks that the MTProto proxy is reachable and that the stored session
# is still authenticated.
#
# @return [Hash] { success: Boolean, message: String }
def test_mtproto_connection
  base_url = @config['mtproto_api_url'] || 'http://localhost:8081'

  # Step 1: is the proxy up at all?
  health = Net::HTTP.get_response(URI("#{base_url}/health"))
  unless health.is_a?(Net::HTTPSuccess)
    return { success: false, message: "Telegram MTProto server not running at #{base_url}" }
  end

  # Step 2: is there a session, and does the proxy still accept it?
  session = @config['session_string']
  unless session
    return { success: false, message: "No session configured. Run authentication first." }
  end

  status_uri = URI("#{base_url}/session/status")
  status_uri.query = URI.encode_www_form(session_string: session)
  status = Net::HTTP.get_response(status_uri)
  unless status.is_a?(Net::HTTPSuccess)
    return { success: false, message: "Failed to check session status" }
  end

  info = JSON.parse(status.body)
  unless info['authenticated']
    return { success: false, message: "Session expired. Re-authentication required." }
  end

  { success: true, message: "Connected as #{info['username'] || info['phone']}" }
end
382
+
383
# Runs the interactive MTProto login against the proxy server: request a
# login code, submit the code typed by the user, and submit a 2FA password
# when the proxy asks for one. On success the returned session string is
# stored in @config['session_string'].
#
# Network errors are not rescued here and propagate to the caller.
# NOTE(review): the 2FA password is read with normal terminal echo.
#
# @return [Boolean] true when a session string was obtained, false otherwise
def authenticate_mtproto
  api_url = @config['mtproto_api_url'] || 'http://localhost:8081'

  puts "\n=== Telegram Authentication ==="
  puts "Phone: #{@config['phone_number']}"

  auth_data = run_mtproto_login(api_url)
  unless auth_data && auth_data['session_string']
    puts "\n✗ Authentication failed"
    return false
  end

  @config['session_string'] = auth_data['session_string']
  puts "\n✓ Authentication successful!"
  puts "Session saved. You won't need to authenticate again."
  true
end

# Performs the start / code / optional-2FA exchange.
# Returns the last successfully parsed auth response, or nil when a step
# failed or the user closed standard input.
def run_mtproto_login(api_url)
  started = mtproto_post_json(api_url, '/auth/start', {
    api_id: @config['api_id'],
    api_hash: @config['api_hash'],
    phone_number: @config['phone_number']
  })
  return nil unless started && started['code_sent']

  code = prompt_for_line("\nEnter the code sent to your Telegram app: ")
  return nil unless code

  auth_data = mtproto_post_json(api_url, '/auth/code', {
    session_id: started['session_id'],
    code: code
  })
  return auth_data unless auth_data && auth_data['requires_2fa']

  password = prompt_for_line("Enter your 2FA password: ")
  return nil unless password

  # If the 2FA call fails, keep the code-step response; it has no session
  # string, so the caller reports failure.
  mtproto_post_json(api_url, '/auth/2fa', {
    session_id: started['session_id'],
    password: password
  }) || auth_data
end

# POSTs a JSON payload to the proxy and returns the parsed JSON body, or nil
# for a non-2xx response. TLS is enabled when api_url uses https.
def mtproto_post_json(api_url, path, payload)
  uri = URI("#{api_url}#{path}")
  request = Net::HTTP::Post.new(uri)
  request['Content-Type'] = 'application/json'
  request.body = payload.to_json

  response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
    http.request(request)
  end
  response.is_a?(Net::HTTPSuccess) ? JSON.parse(response.body) : nil
end

# Prints a prompt and reads one line from standard input.
# Reads $stdin explicitly because Kernel#gets would read from files named in
# ARGV. Returns nil on end-of-input rather than raising.
def prompt_for_line(label)
  print label
  $stdout.flush
  line = $stdin.gets
  line && line.chomp
end
462
+
463
# Converts a message returned by the MTProto proxy into the hash format
# used for storage.
#
# @param msg [Hash] decoded proxy message (string keys)
# @return [Hash] message attributes; :attachments is a JSON string or nil
def convert_mtproto_message(msg)
  sender = msg['sender_name'] || msg['sender_username'] || msg['sender_id'].to_s
  recipient = msg['chat_name'] || msg['chat_id'].to_s

  content = msg['text'] || ''
  # Keep at most 50 characters and add the ellipsis only when something
  # was actually cut off.
  subject = content.length > 50 ? "#{content[0, 50]}..." : content

  media = msg['media']
  attachments =
    if media
      [{ type: media['type'], file_id: media['file_id'], caption: media['caption'] }].to_json
    end

  {
    source_id: @source.id,
    source_type: 'telegram',
    external_id: "telegram_#{msg['id']}",
    sender: sender,
    recipient: recipient,
    subject: subject,
    content: content,
    raw_data: msg.to_json,
    attachments: attachments,
    # Use the proxy's date as given; fall back to the current time.
    timestamp: msg['date'] || Time.now.iso8601,
    is_read: msg['is_read'] ? 1 : 0
  }
end
496
+ end
497
+ end
498
+ end
@@ -0,0 +1,207 @@
1
+ require 'digest'
2
+ require 'shellwords'
3
+ require 'json'
4
+ require 'time'
5
+ require_relative 'base'
6
+
7
+ module Heathrow
8
+ module Sources
9
+ class Webpage < Base
10
# Sets up the page list and the snapshot directory (~/.heathrow/webwatch).
#
# @param name [Object] forwarded to the superclass initializer
# @param config [Hash] source configuration; 'pages' holds the watch list
# @param db [Object] forwarded to the superclass initializer
def initialize(name, config, db)
  super
  @pages = config['pages'] || []
  @snapshots_dir = File.join(Dir.home, '.heathrow', 'webwatch')

  # Dir.mkdir does not create missing parents, so create ~/.heathrow first;
  # otherwise the first run on a fresh account raises Errno::ENOENT.
  [File.dirname(@snapshots_dir), @snapshots_dir].each do |dir|
    Dir.mkdir(dir) unless Dir.exist?(dir)
  end
end
16
+
17
# Checks every watched page and counts how many change messages were stored.
#
# A failure on one page is reported on STDERR (only when ENV['DEBUG'] is
# set) and counted as zero; the remaining pages are still checked.
#
# @param source_id [Object] id passed through to check_page
# @return [Integer] total of the values returned by check_page
def sync(source_id)
  @pages.sum do |page|
    begin
      check_page(source_id, page)
    rescue => e
      STDERR.puts "Webwatch error #{page['url']}: #{e.message}" if ENV['DEBUG']
      0
    end
  end
end
28
+
29
# Runs a sync for this source when it is enabled and has a database record.
# Messages are written by sync itself, so nothing is returned to the caller.
#
# @return [Array] always an empty Array
def fetch
  if enabled?
    record = @db.get_source_by_name(@name)
    if record
      sync(record['id'])
      update_last_fetch
    end
  end

  []
end
37
+
38
# Adds a page to the watch list unless its URL is already present, then
# saves the configuration (also when nothing was added).
#
# @param url [String] page address
# @param title [String, nil] display title
# @param selector [String, nil] optional selector limiting the watched part
# @param tags [Array] tags stored with the page
# @return [Object] whatever save_config returns
def add_page(url, title: nil, selector: nil, tags: [])
  already_watched = @pages.any? { |existing| existing['url'] == url }
  unless already_watched
    @pages << { 'url' => url, 'title' => title, 'selector' => selector, 'tags' => tags }
  end

  @config['pages'] = @pages
  save_config
end
44
+
45
# Drops every watch-list entry with the given URL and saves the config.
#
# @param url [String] address of the page to stop watching
# @return [Object] whatever save_config returns
def remove_page(url)
  @pages.delete_if { |entry| entry['url'] == url }
  @config['pages'] = @pages
  save_config
end
50
+
51
# Returns the watch list as symbol-keyed hashes.
#
# @return [Array<Hash>] one { url:, title:, selector:, tags: } hash per
#   page; tags defaults to an empty Array
def list_pages
  @pages.map do |page|
    url, title, selector = page.values_at('url', 'title', 'selector')
    { url: url, title: title, selector: selector, tags: page['tags'] || [] }
  end
end
54
+
55
+ private
56
+
57
# Fetches one page, compares it with the stored snapshot and, when the text
# changed, stores a "Changed: …" message describing the difference.
#
# @param source_id [Object] id recorded on the stored message
# @param page [Hash] watch-list entry ('url', 'title', 'selector', 'tags')
# @return [Integer] 1 when a change message was inserted, otherwise 0
def check_page(source_id, page)
  url = page['url']
  title = page['title'] || url
  selector = page['selector']
  tags = page['tags'] || []

  raw = http_get(url)
  return 0 unless raw && !raw.empty?

  # Replace invalid byte sequences up front; otherwise the regex-based
  # extraction and gsub below raise ArgumentError and the page can never
  # be tracked.
  raw = raw.scrub

  content = if selector && !selector.empty?
              extract_by_selector(raw, selector)
            else
              extract_body(raw)
            end
  return 0 if content.nil? || content.empty?

  # Collapse whitespace so layout-only changes do not count as changes.
  normalized = content.gsub(/\s+/, ' ').strip

  snapshot_file = File.join(@snapshots_dir, Digest::MD5.hexdigest(url))

  # First run: store the baseline without emitting a message.
  unless File.exist?(snapshot_file)
    File.write(snapshot_file, normalized)
    return 0
  end

  old_content = File.read(snapshot_file)
  # Digest comparison is byte-wise, so it is unaffected by string encodings.
  return 0 if Digest::MD5.hexdigest(normalized) == Digest::MD5.hexdigest(old_content)

  diff = generate_diff(old_content, normalized)
  now = Time.now # one timestamp for the id and every time field

  data = {
    source_id: source_id,
    external_id: "webwatch_#{Digest::MD5.hexdigest(url + now.to_i.to_s)}",
    sender: title,
    sender_name: title,
    recipients: ['Web Watch'],
    subject: "Changed: #{title}",
    content: diff,
    html_content: diff,
    timestamp: now.to_i,
    received_at: now.to_i,
    read: false,
    starred: false,
    archived: false,
    labels: ['Web Watch'] + tags,
    metadata: {
      link: url,
      page_title: title,
      selector: selector,
      tags: tags,
      changed_at: now.iso8601
    },
    raw_data: { link: url, page_title: title }
  }

  inserted = begin
    @db.insert_message(data)
    1
  rescue SQLite3::ConstraintException
    0
  end

  # Update the snapshot only after the insert attempt: if insert_message
  # raises anything else, the old snapshot stays and the change is detected
  # again on the next run instead of being lost.
  File.write(snapshot_file, normalized)
  inserted
end
128
+
129
# Downloads a URL with curl (follows redirects, 20 s time limit).
#
# @param url [String] address to fetch
# @return [String, nil] response body, or nil when curl fails, returns
#   nothing, or cannot be started
def http_get(url)
  # The argv array form runs curl directly, with no shell involved. The
  # "--" ends option parsing, so a URL beginning with "-" is treated as a
  # URL and not as a curl option.
  command = [
    'curl', '-sL', '--max-time', '20', '--max-redirs', '8',
    '-A', 'Mozilla/5.0 (X11; Linux x86_64) Heathrow/1.0',
    '--', url.to_s
  ]
  result = IO.popen(command, err: File::NULL, &:read)
  $?.success? && !result.empty? ? result : nil
rescue => e
  STDERR.puts "Webwatch fetch error #{url}: #{e.message}" if ENV['DEBUG']
  nil
end
136
+
137
# Removes script, style, head and nav elements (including their content)
# and converts what is left to plain text via strip_html.
#
# @param html [String] raw page markup
# @return [String] text produced by strip_html
def extract_body(html)
  without_chrome = %w[script style head nav].inject(html) do |markup, tag|
    markup.gsub(/<#{tag}[^>]*>.*?<\/#{tag}>/mi, '')
  end
  strip_html(without_chrome)
end
145
+
146
# Extracts the text of elements matching a very small subset of CSS
# selectors, using regular expressions rather than an HTML parser.
#
# Supported forms: "#id", ".class", "tag" and "tag.class". Any other
# selector falls back to extract_body. Matching stops at the first closing
# tag after the opening tag (for "tag" and "tag.class", the first closing
# tag of that element name).
#
# @param html [String] raw page markup
# @param selector [String] selector in one of the supported forms
# @return [String] text of all matches joined by newlines, via strip_html
def extract_by_selector(html, selector)
  pattern =
    case selector
    when /^#([\w-]+)$/
      id = Regexp.escape(Regexp.last_match(1))
      /<[^>]+id\s*=\s*["']#{id}["'][^>]*>(.*?)<\/[^>]+>/mi
    when /^\.([\w-]+)$/
      css_class = Regexp.escape(Regexp.last_match(1))
      /<[^>]+class\s*=\s*["'][^"']*\b#{css_class}\b[^"']*["'][^>]*>(.*?)<\/[^>]+>/mi
    when /^(\w+)$/
      tag = Regexp.escape(Regexp.last_match(1))
      /<#{tag}[^>]*>(.*?)<\/#{tag}>/mi
    when /^(\w+)\.([\w-]+)$/
      tag = Regexp.escape(Regexp.last_match(1))
      css_class = Regexp.escape(Regexp.last_match(2))
      /<#{tag}[^>]+class\s*=\s*["'][^"']*\b#{css_class}\b[^"']*["'][^>]*>(.*?)<\/#{tag}>/mi
    end

  return extract_body(html) unless pattern

  fragments = html.scan(pattern).flatten
  strip_html(fragments.join("\n"))
end
169
+
170
# Builds a short human-readable summary of what changed between two texts.
#
# Both texts are split on ". " into sentence-like pieces; pieces found only
# in the old text are listed under REMOVED and pieces found only in the new
# text under ADDED, with at most 15 entries shown per section.
#
# @param old_text [String] previous snapshot
# @param new_text [String] current snapshot
# @return [String] the summary, or a generic notice when no piece differs
def generate_diff(old_text, new_text)
  pieces_of = ->(text) { text.split('. ').map(&:strip).reject(&:empty?) }
  before = pieces_of.call(old_text)
  after = pieces_of.call(new_text)

  sections = [
    ['REMOVED:', ' - ', before - after],
    ['ADDED:', ' + ', after - before]
  ]

  report = []
  sections.each do |heading, marker, entries|
    next if entries.empty?

    report << "" unless report.empty? # blank line between the two sections
    report << heading
    entries.first(15).each { |entry| report << "#{marker}#{entry}" }
    report << " ... (#{entries.size - 15} more)" if entries.size > 15
  end

  report.empty? ? "Content changed (details differ in whitespace/structure)" : report.join("\n")
end
192
+
193
# Entities decoded by strip_html, mapped to their plain-text characters.
HTML_ENTITY_TEXT = {
  '&nbsp;' => ' ',
  '&amp;' => '&',
  '&lt;' => '<',
  '&gt;' => '>',
  '&quot;' => '"',
  '&#39;' => "'"
}.freeze

# Removes markup tags, decodes a handful of common entities and collapses
# runs of blank lines.
#
# @param text [String, nil] markup fragment
# @return [String] plain text; '' for nil or empty input
def strip_html(text)
  return '' if text.nil? || text.empty?

  text.gsub(/<[^>]*>/, '')
      # Decode all entities in one pass. Replacing them one after another
      # would decode "&amp;lt;" twice and yield "<" instead of "&lt;".
      .gsub(/&(?:nbsp|amp|lt|gt|quot|#39);/, HTML_ENTITY_TEXT)
      .gsub(/\n\s*\n\s*\n/, "\n\n")
      .strip
end
205
+ end
206
+ end
207
+ end