webhookdb 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. checksums.yaml +4 -4
  2. data/db/migrations/026_undo_integration_backfill_cursor.rb +2 -0
  3. data/db/migrations/032_remove_db_defaults.rb +2 -0
  4. data/db/migrations/043_text_search.rb +2 -0
  5. data/db/migrations/047_sync_parallelism.rb +9 -0
  6. data/db/migrations/048_sync_stats.rb +9 -0
  7. data/db/migrations/049_error_handlers.rb +18 -0
  8. data/db/migrations/050_logged_webhook_indices.rb +25 -0
  9. data/db/migrations/051_partitioning.rb +9 -0
  10. data/integration/async_spec.rb +0 -2
  11. data/integration/service_integrations_spec.rb +0 -2
  12. data/lib/amigo/durable_job.rb +2 -2
  13. data/lib/amigo/job_in_context.rb +12 -0
  14. data/lib/webhookdb/api/entities.rb +6 -2
  15. data/lib/webhookdb/api/error_handlers.rb +104 -0
  16. data/lib/webhookdb/api/helpers.rb +8 -1
  17. data/lib/webhookdb/api/icalproxy.rb +22 -0
  18. data/lib/webhookdb/api/install.rb +2 -1
  19. data/lib/webhookdb/api/saved_queries.rb +1 -0
  20. data/lib/webhookdb/api/saved_views.rb +1 -0
  21. data/lib/webhookdb/api/service_integrations.rb +1 -1
  22. data/lib/webhookdb/api/sync_targets.rb +1 -1
  23. data/lib/webhookdb/api/system.rb +5 -0
  24. data/lib/webhookdb/api/webhook_subscriptions.rb +1 -0
  25. data/lib/webhookdb/api.rb +4 -1
  26. data/lib/webhookdb/apps.rb +4 -0
  27. data/lib/webhookdb/async/autoscaler.rb +10 -0
  28. data/lib/webhookdb/async/job.rb +4 -0
  29. data/lib/webhookdb/async/scheduled_job.rb +4 -0
  30. data/lib/webhookdb/async.rb +2 -0
  31. data/lib/webhookdb/backfiller.rb +17 -4
  32. data/lib/webhookdb/concurrent.rb +96 -0
  33. data/lib/webhookdb/connection_cache.rb +29 -8
  34. data/lib/webhookdb/customer.rb +2 -2
  35. data/lib/webhookdb/database_document.rb +1 -1
  36. data/lib/webhookdb/db_adapter/default_sql.rb +1 -14
  37. data/lib/webhookdb/db_adapter/partition.rb +14 -0
  38. data/lib/webhookdb/db_adapter/partitioning.rb +8 -0
  39. data/lib/webhookdb/db_adapter/pg.rb +77 -5
  40. data/lib/webhookdb/db_adapter/snowflake.rb +15 -6
  41. data/lib/webhookdb/db_adapter.rb +24 -2
  42. data/lib/webhookdb/fixtures/logged_webhooks.rb +4 -0
  43. data/lib/webhookdb/fixtures/organization_error_handlers.rb +20 -0
  44. data/lib/webhookdb/http.rb +29 -15
  45. data/lib/webhookdb/icalendar.rb +30 -9
  46. data/lib/webhookdb/jobs/amigo_test_jobs.rb +1 -1
  47. data/lib/webhookdb/jobs/backfill.rb +21 -25
  48. data/lib/webhookdb/jobs/create_mirror_table.rb +3 -4
  49. data/lib/webhookdb/jobs/deprecated_jobs.rb +2 -0
  50. data/lib/webhookdb/jobs/emailer.rb +2 -1
  51. data/lib/webhookdb/jobs/front_signalwire_message_channel_sync_inbound.rb +15 -0
  52. data/lib/webhookdb/jobs/icalendar_delete_stale_cancelled_events.rb +7 -2
  53. data/lib/webhookdb/jobs/icalendar_enqueue_syncs.rb +74 -11
  54. data/lib/webhookdb/jobs/icalendar_enqueue_syncs_for_urls.rb +22 -0
  55. data/lib/webhookdb/jobs/icalendar_sync.rb +21 -9
  56. data/lib/webhookdb/jobs/increase_event_handler.rb +3 -2
  57. data/lib/webhookdb/jobs/logged_webhooks_replay.rb +5 -3
  58. data/lib/webhookdb/jobs/message_dispatched.rb +1 -0
  59. data/lib/webhookdb/jobs/model_event_system_log_tracker.rb +7 -0
  60. data/lib/webhookdb/jobs/monitor_metrics.rb +1 -1
  61. data/lib/webhookdb/jobs/organization_database_migration_notify.rb +32 -0
  62. data/lib/webhookdb/jobs/organization_database_migration_run.rb +4 -6
  63. data/lib/webhookdb/jobs/organization_error_handler_dispatch.rb +26 -0
  64. data/lib/webhookdb/jobs/prepare_database_connections.rb +1 -0
  65. data/lib/webhookdb/jobs/process_webhook.rb +11 -12
  66. data/lib/webhookdb/jobs/renew_watch_channel.rb +7 -10
  67. data/lib/webhookdb/jobs/replication_migration.rb +5 -2
  68. data/lib/webhookdb/jobs/reset_code_create_dispatch.rb +1 -2
  69. data/lib/webhookdb/jobs/scheduled_backfills.rb +2 -2
  70. data/lib/webhookdb/jobs/send_invite.rb +3 -2
  71. data/lib/webhookdb/jobs/send_test_webhook.rb +1 -3
  72. data/lib/webhookdb/jobs/send_webhook.rb +4 -5
  73. data/lib/webhookdb/jobs/stale_row_deleter.rb +31 -0
  74. data/lib/webhookdb/jobs/sync_target_enqueue_scheduled.rb +3 -0
  75. data/lib/webhookdb/jobs/sync_target_run_sync.rb +9 -15
  76. data/lib/webhookdb/jobs/webhook_subscription_delivery_event.rb +5 -8
  77. data/lib/webhookdb/liquid/expose.rb +1 -1
  78. data/lib/webhookdb/liquid/filters.rb +1 -1
  79. data/lib/webhookdb/liquid/partial.rb +2 -2
  80. data/lib/webhookdb/logged_webhook/resilient.rb +3 -3
  81. data/lib/webhookdb/logged_webhook.rb +16 -2
  82. data/lib/webhookdb/message/email_transport.rb +1 -1
  83. data/lib/webhookdb/message.rb +2 -2
  84. data/lib/webhookdb/messages/error_generic_backfill.rb +2 -0
  85. data/lib/webhookdb/messages/error_icalendar_fetch.rb +2 -0
  86. data/lib/webhookdb/messages/error_signalwire_send_sms.rb +2 -0
  87. data/lib/webhookdb/organization/alerting.rb +50 -4
  88. data/lib/webhookdb/organization/database_migration.rb +1 -1
  89. data/lib/webhookdb/organization/db_builder.rb +4 -3
  90. data/lib/webhookdb/organization/error_handler.rb +141 -0
  91. data/lib/webhookdb/organization.rb +62 -9
  92. data/lib/webhookdb/postgres/model_utilities.rb +2 -0
  93. data/lib/webhookdb/postgres.rb +1 -3
  94. data/lib/webhookdb/replicator/base.rb +136 -29
  95. data/lib/webhookdb/replicator/base_stale_row_deleter.rb +165 -0
  96. data/lib/webhookdb/replicator/email_octopus_contact_v1.rb +0 -1
  97. data/lib/webhookdb/replicator/fake.rb +100 -88
  98. data/lib/webhookdb/replicator/front_signalwire_message_channel_app_v1.rb +105 -44
  99. data/lib/webhookdb/replicator/github_repo_v1_mixin.rb +17 -0
  100. data/lib/webhookdb/replicator/icalendar_calendar_v1.rb +144 -23
  101. data/lib/webhookdb/replicator/icalendar_event_v1.rb +20 -44
  102. data/lib/webhookdb/replicator/icalendar_event_v1_partitioned.rb +33 -0
  103. data/lib/webhookdb/replicator/intercom_contact_v1.rb +1 -0
  104. data/lib/webhookdb/replicator/intercom_conversation_v1.rb +1 -0
  105. data/lib/webhookdb/replicator/intercom_v1_mixin.rb +24 -2
  106. data/lib/webhookdb/replicator/partitionable_mixin.rb +116 -0
  107. data/lib/webhookdb/replicator/shopify_v1_mixin.rb +1 -1
  108. data/lib/webhookdb/replicator/signalwire_message_v1.rb +1 -2
  109. data/lib/webhookdb/replicator/sponsy_v1_mixin.rb +1 -1
  110. data/lib/webhookdb/replicator/transistor_episode_stats_v1.rb +0 -1
  111. data/lib/webhookdb/replicator.rb +4 -1
  112. data/lib/webhookdb/service/helpers.rb +4 -0
  113. data/lib/webhookdb/service/middleware.rb +6 -2
  114. data/lib/webhookdb/service_integration.rb +5 -0
  115. data/lib/webhookdb/signalwire.rb +1 -1
  116. data/lib/webhookdb/spec_helpers/async.rb +0 -4
  117. data/lib/webhookdb/spec_helpers/sentry.rb +32 -0
  118. data/lib/webhookdb/spec_helpers/shared_examples_for_replicators.rb +87 -1
  119. data/lib/webhookdb/spec_helpers.rb +1 -0
  120. data/lib/webhookdb/sync_target.rb +195 -29
  121. data/lib/webhookdb/tasks/admin.rb +1 -1
  122. data/lib/webhookdb/tasks/annotate.rb +1 -1
  123. data/lib/webhookdb/tasks/db.rb +13 -1
  124. data/lib/webhookdb/tasks/docs.rb +1 -1
  125. data/lib/webhookdb/tasks/fixture.rb +1 -1
  126. data/lib/webhookdb/tasks/message.rb +1 -1
  127. data/lib/webhookdb/tasks/regress.rb +1 -1
  128. data/lib/webhookdb/tasks/release.rb +1 -1
  129. data/lib/webhookdb/tasks/sidekiq.rb +1 -1
  130. data/lib/webhookdb/tasks/specs.rb +1 -1
  131. data/lib/webhookdb/version.rb +1 -1
  132. data/lib/webhookdb/webhook_subscription.rb +2 -3
  133. data/lib/webhookdb.rb +3 -1
  134. metadata +88 -54
  135. data/lib/webhookdb/jobs/organization_database_migration_notify_finished.rb +0 -21
  136. data/lib/webhookdb/jobs/organization_database_migration_notify_started.rb +0 -21

data/lib/webhookdb/replicator/icalendar_calendar_v1.rb

@@ -77,6 +77,7 @@ The secret to use for signing is:
       col.new(:event_count, INTEGER, optional: true),
       col.new(:feed_bytes, INTEGER, optional: true),
       col.new(:last_sync_duration_ms, INTEGER, optional: true),
+      col.new(:last_fetch_context, OBJECT, optional: true),
     ]
   end
 
@@ -111,7 +112,7 @@ The secret to use for signing is:
     external_id = request.body.fetch("external_id")
     case request_type
     when "SYNC"
-      super(request)
+      super
       Webhookdb::Jobs::IcalendarSync.perform_async(self.service_integration.id, external_id)
       return
     when "DELETE"
@@ -121,14 +122,12 @@ The secret to use for signing is:
       unless Webhookdb::RACK_ENV == "test"
         raise "someone tried to use the special unit test google event type outside of unit tests"
       end
-      return super(request)
+      return super
     else
       raise ArgumentError, "Unknown request type: #{request_type}"
     end
   end
 
-  CLEANUP_SERVICE_NAMES = ["icalendar_event_v1"].freeze
-
   def rows_needing_sync(dataset, now: Time.now)
     cutoff = now - Webhookdb::Icalendar.sync_period_hours.hours
     return dataset.where(Sequel[last_synced_at: nil] | Sequel.expr { last_synced_at < cutoff })
@@ -136,7 +135,7 @@ The secret to use for signing is:
 
   def delete_data_for_external_id(external_id)
     relevant_integrations = self.service_integration.recursive_dependents.
-      filter { |d| CLEANUP_SERVICE_NAMES.include?(d.service_name) }
+      filter { |d| Webhookdb::Icalendar::EVENT_REPLICATORS.include?(d.service_name) }
     self.admin_dataset do |ds|
       ds.db.transaction do
         ds.where(external_id:).delete
@@ -157,7 +156,7 @@ The secret to use for signing is:
       @now = now
     end
 
-    def upsert_page_size = 500
+    def upsert_page_size = 2000
     def conditional_upsert? = true
 
     def prepare_body(body)
@@ -166,12 +165,33 @@ The secret to use for signing is:
     end
   end
 
-  def sync_row(row)
+  def sync_row(row, force: false, now: Time.now)
     Appydays::Loggable.with_log_tags(icalendar_url: row.fetch(:ics_url)) do
+      last_synced_at = row.fetch(:last_synced_at)
+      should_sync = force ||
+        last_synced_at.nil? ||
+        # If a proxy is configured, we always want to try to sync,
+        # since this could have come from a webhook, but also the proxy feed refresh TTL
+        # is likely much lower than ICALENDAR_SYNC_PERIOD_HOURS so it's good to check on it.
+        # The check is very fast (should 304) so is safe to do relatively often.
+        Webhookdb::Icalendar.proxy_url.present? ||
+        last_synced_at < (now - Webhookdb::Icalendar.sync_period_hours.hours)
+      unless should_sync
+        self.logger.info("skip_sync_recently_synced", last_synced_at:)
+        return
+      end
       self.with_advisory_lock(row.fetch(:pk)) do
         start = Time.now
-        now = Time.now
-        if (dep = self.find_dependent("icalendar_event_v1"))
+        if (dep = self.find_dependent(Webhookdb::Icalendar::EVENT_REPLICATORS))
+          if dep.replicator.avoid_writes?
+            # Check if this table is being vacuumed/etc. We use this instead of a semaphore job,
+            # since it's a better fit for icalendar, which is pre-scheduled, rather than reactive.
+            # That is, when we receive webhooks, a semaphore job gives us a more predictable rate;
+            # but icalendar rate is negotiated in advance (when enqueuing jobs),
+            # and we can be more 'helpful' to something like a vacuum by not running any jobs at all.
+            self.logger.info("skip_sync_table_locked")
+            raise Amigo::Retry::Retry, 60.seconds + (rand * 10.seconds)
+          end
           processor = self._sync_row(row, dep, now:)
         end
         self.admin_dataset do |ds|
@@ -181,6 +201,12 @@ The secret to use for signing is:
             event_count: processor&.upserted_identities&.count,
             feed_bytes: processor&.read_bytes,
             last_sync_duration_ms: (Time.now - start).in_milliseconds,
+            last_fetch_context: {
+              "hash" => processor&.feed_hash,
+              "content_type" => processor&.headers&.fetch("Content-Type", nil),
+              "content_length" => processor&.headers&.fetch("Content-Length", nil),
+              "etag" => processor&.headers&.fetch("Etag", nil),
+            }.to_json,
           )
         end
       end
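
For reference, last_fetch_context is stored as a small JSON object built from the previous fetch's response; a synced row would hold something like the following (all values illustrative):

    {
      "hash": "0f343b0931126a20f133d67c2b018a3b",
      "content_type": "text/calendar; charset=utf-8",
      "content_length": "48210",
      "etag": "\"66f2a1b4-bc52\""
    }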
@@ -191,14 +217,19 @@ The secret to use for signing is:
     calendar_external_id = row.fetch(:external_id)
     begin
       request_url = self._clean_ics_url(row.fetch(:ics_url))
-      io = Webhookdb::Http.chunked_download(request_url, rewindable: false)
-    rescue Down::Error, URI::InvalidURIError => e
+      io = self._make_ics_request(request_url, row.fetch(:last_fetch_context))
+    rescue Down::Error,
+        URI::InvalidURIError,
+        HTTPX::NativeResolveError,
+        HTTPX::InsecureRedirectError,
+        HTTPX::Connection::HTTP2::Error,
+        EOFError => e
       self._handle_down_error(e, request_url:, calendar_external_id:)
       return
     end
 
     upserter = Upserter.new(dep.replicator, calendar_external_id, now:)
-    processor = EventProcessor.new(io, upserter)
+    processor = EventProcessor.new(io:, upserter:, headers: io.data[:headers])
     processor.process
     # Delete all the extra replicator rows, and cancel all the rows that weren't upserted.
     dep.replicator.admin_dataset do |ds|
@@ -219,6 +250,24 @@ The secret to use for signing is:
     return processor
   end
 
+  def _make_ics_request(request_url, last_fetch_context)
+    # Some servers require a VERY explicit accept header,
+    # so tell them we prefer icalendar here.
+    # Using Httpx, Accept-Encoding is gzip,deflate
+    # which seems fine (server should use identity as worst case).
+    headers = {
+      "Accept" => "text/calendar,*/*",
+    }
+    headers["If-None-Match"] = last_fetch_context["etag"] if last_fetch_context&.[]("etag")
+    if (proxy_url = Webhookdb::Icalendar.proxy_url).present?
+      request_url = "#{proxy_url.delete_suffix('/')}/?url=#{URI.encode_www_form_component(request_url)}"
+      headers["Authorization"] = "Apikey #{Webhookdb::Icalendar.proxy_api_key}" if
+        Webhookdb::Icalendar.proxy_api_key.present?
+    end
+    resp = Webhookdb::Http.chunked_download(request_url, rewindable: false, headers:)
+    return resp
+  end
+
   # We get all sorts of strange urls, fix up what we can.
   def _clean_ics_url(url)
     u = URI(url)
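
To make the conditional-fetch and proxy behavior above concrete, here is a small illustrative sketch (the feed URL, proxy URL, etag, and API key are made up; a 304 from the conditional request surfaces as Down::NotModified, which feed_changed? below relies on):

    require "uri"

    proxy_url = "https://icalproxy.example.com/" # stand-in for Webhookdb::Icalendar.proxy_url
    feed_url = "https://calendar.example.com/feed.ics"
    request_url = "#{proxy_url.delete_suffix('/')}/?url=#{URI.encode_www_form_component(feed_url)}"
    # => "https://icalproxy.example.com/?url=https%3A%2F%2Fcalendar.example.com%2Ffeed.ics"
    headers = {
      "Accept" => "text/calendar,*/*",
      "If-None-Match" => '"66f2a1b4-bc52"',     # only when a previous fetch stored an etag
      "Authorization" => "Apikey my-proxy-key", # only when proxy_api_key is configured
    }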
@@ -251,12 +300,17 @@ The secret to use for signing is:
       else
         self._handle_retryable_down_error!(e, request_url:, calendar_external_id:)
       end
-    when Down::TimeoutError, Down::ConnectionError, Down::InvalidUrl, URI::InvalidURIError
+    when Down::TimeoutError, Down::ConnectionError, Down::InvalidUrl,
+        Errno::ECONNRESET,
+        URI::InvalidURIError,
+        HTTPX::NativeResolveError, HTTPX::InsecureRedirectError,
+        HTTPX::Connection::HTTP2::Error,
+        EOFError
       response_status = 0
       response_body = e.to_s
     when Down::ClientError
       raise e if e.response.nil?
-      response_status = e.response.code.to_i
+      response_status = e.response.status.to_i
       self._handle_retryable_down_error!(e, request_url:, calendar_external_id:) if
         self._retryable_client_error?(e, request_url:)
       # These are all the errors we've seen, we can't do anything about.
@@ -268,18 +322,24 @@ The secret to use for signing is:
         404, 405, # Fundamental issues with the URL given
         409, 410, # More access problems
         417, # If someone uses an Outlook HTML calendar, fetch gives us a 417
+        422, # Sometimes used instead of 404
         429, # Usually 429s are retried (as above), but in some cases they're not.
+        500, 503, 504, # Intermittent server issues, usually
+        599, # Represents a timeout in icalproxy
       ]
       # For most client errors, we can't do anything about it. For example,
       # an 'unshared' URL could result in a 401, 403, 404, or even a 405.
       # For now, other client errors, we can raise on,
       # in case it's something we can fix/work around.
       # For example, it's possible something like a 415 is a WebhookDB issue.
+      if response_status == 421 && (origin_err = e.response.headers["Ical-Proxy-Origin-Error"])
+        response_status = origin_err.to_i
+      end
       raise e unless expected_errors.include?(response_status)
-      response_body = e.response.body.to_s
+      response_body = self._safe_read_body(e)
     when Down::ServerError
-      response_status = e.response.code.to_i
-      response_body = e.response.body.to_s
+      response_status = e.response.status.to_i
+      response_body = self._safe_read_body(e)
     else
       response_body = nil
       response_status = nil
@@ -299,8 +359,16 @@ The secret to use for signing is:
     self.service_integration.organization.alerting.dispatch_alert(message, separate_connection: false)
   end
 
+  # We can hit an error while reading the error body, since it was opened as a stream.
+  # Ignore those errors.
+  def _safe_read_body(e)
+    return e.response.body.to_s
+  rescue OpenSSL::SSL::SSLError, HTTPX::Error
+    return "<error reading body>"
+  end
+
   def _retryable_client_error?(e, request_url:)
-    code = e.response.code.to_i
+    code = e.response.status.to_i
     # This is a bad domain that returns 429 for most requests.
     # Tell the org admins it won't sync.
     return false if code == 429 && request_url.start_with?("https://ical.schedulestar.com")
@@ -317,7 +385,7 @@ The secret to use for signing is:
     retry_in = rand(4..60).minutes
     self.logger.debug(
       "icalendar_fetch_error_retry",
-      response_status: e.respond_to?(:response) ? e.response&.code : 0,
+      response_status: e.respond_to?(:response) ? e.response&.status : 0,
       request_url:,
       calendar_external_id:,
       retry_at: Time.now + retry_in,
@@ -326,11 +394,12 @@ The secret to use for signing is:
   end
 
   class EventProcessor
-    attr_reader :upserted_identities, :read_bytes
+    attr_reader :upserted_identities, :read_bytes, :headers
 
-    def initialize(io, upserter)
+    def initialize(io:, upserter:, headers:)
       @io = io
       @upserter = upserter
+      @headers = headers
       # Keep track of everything we upsert. For any rows we aren't upserting,
       # delete them if they're recurring, or cancel them if they're not recurring.
       # If doing it this way is slow, we could invert this (pull down all IDs and pop from the set).
@@ -346,8 +415,11 @@ The secret to use for signing is:
       # Keep track of the bytes we've read from the file.
       # Never trust Content-Length headers for ical feeds.
       @read_bytes = 0
+      @feed_md5 = Digest::MD5.new
     end
 
+    def feed_hash = @feed_md5.hexdigest
+
     def delete_condition
       return nil if @max_sequence_num_by_uid.empty?
       return @max_sequence_num_by_uid.map do |uid, n|
@@ -486,7 +558,14 @@ The secret to use for signing is:
         # The new UID has the sequence number.
         e["UID"] = {"v" => "#{uid}-#{idx}"}
         e["DTSTART"] = self._ical_entry_from_ruby(occ.start_time, start_entry, is_date)
-        e["DTEND"] = self._ical_entry_from_ruby(occ.end_time, end_entry, is_date) if has_end_time
+        if has_end_time
+          if !is_date && end_entry["VALUE"] == "DATE"
+            # It's possible that DTSTART is a time, but DTEND is a date. This makes no sense,
+            # so skip setting an end date. It will be in the :data column at least.
+          else
+            e["DTEND"] = self._ical_entry_from_ruby(occ.end_time, end_entry, is_date)
+          end
+        end
         yield e
         final_sequence = idx
         break if occ.start_time > dont_project_after
@@ -508,7 +587,11 @@ The secret to use for signing is:
       return {"v" => r.strftime("%Y%m%dT%H%M%S"), "TZID" => tzid} if tzid
       value = entry.fetch("v")
       return {"v" => value} if value.end_with?("Z")
-      raise "Cannot create ical entry from: #{r}, #{entry}, is_date: #{is_date}"
+      if /^\d{8}T\d{6}$/.match?(value)
+        @upserter.upserting_replicator.logger.warn "ical_assuming_utc_time", ical_entry: entry, ruby_time: r
+        return {"v" => "#{value}Z"}
+      end
+      raise "Cannot create ical entry from: '#{r}', #{entry}"
     end
 
     def _icecube_rule_from_ical(ical)
  def _icecube_rule_from_ical(ical)
@@ -551,6 +634,7 @@ The secret to use for signing is:
551
634
  in_vevent = false
552
635
  while (line = @io.gets)
553
636
  @read_bytes += line.size
637
+ @feed_md5.update(line)
554
638
  begin
555
639
  line.rstrip!
556
640
  rescue Encoding::CompatibilityError
@@ -589,4 +673,41 @@ The secret to use for signing is:
       @upserter.upserting_replicator.logger.warn("invalid_vevent_hash", vevent_uids: bad_event_uids.sort)
     end
   end
+
+  # Return true if the data in the feed has changed from what was last synced,
+  # or false if it has not so the sync can be skipped.
+  # This operation is meant to be resource-light (most of the work is the HTTP request),
+  # so should be done in a threadpool.
+  #
+  # - If we have no previous fetch context, we sync.
+  # - If the fetch errors, sync, because we want the normal error handler to figure it out
+  #   (alert admins, etc).
+  # - If the last fetch's content type and length are different from the current, we sync.
+  # - Download the bytes. If the hash of the bytes is different from what was last processed,
+  #   sync. Since this involves reading the streaming body, we must return a copy of the body (a StringIO).
+  def feed_changed?(row)
+    last_fetch = row.fetch(:last_fetch_context)
+    return true if last_fetch.nil? || last_fetch.empty?
+
+    begin
+      url = self._clean_ics_url(row.fetch(:ics_url))
+      resp = self._make_ics_request(url, last_fetch)
+    rescue Down::NotModified
+      return false
+    rescue StandardError
+      return true
+    end
+    headers = resp.data[:headers] || {}
+    content_type_match = headers["Content-Type"] == last_fetch["content_type"] &&
+      headers["Content-Length"] == last_fetch["content_length"]
+    return true unless content_type_match
+    last_hash = last_fetch["hash"]
+    return true if last_hash.nil?
+
+    hash = Digest::MD5.new
+    while (line = resp.gets)
+      hash.update(line)
+    end
+    return hash.hexdigest != last_hash
+  end
 end
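
A rough usage sketch for this check (hypothetical caller; the real scheduling lives in data/lib/webhookdb/jobs/icalendar_enqueue_syncs.rb, not shown in this diff, and concurrent-ruby stands in here for the new webhookdb/concurrent.rb pool):

    require "concurrent"

    pool = Concurrent::FixedThreadPool.new(8) # the check is IO-bound, per the comment above
    rows.each do |row|
      pool.post do
        # Only enqueue the heavier sync job when the feed content actually changed.
        Webhookdb::Jobs::IcalendarSync.perform_async(replicator.service_integration.id, row.fetch(:external_id)) if
          replicator.feed_changed?(row)
      end
    end
    pool.shutdown
    pool.wait_for_termination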

data/lib/webhookdb/replicator/icalendar_event_v1.rb

@@ -2,6 +2,7 @@
 
 require "webhookdb/icalendar"
 require "webhookdb/windows_tz"
+require "webhookdb/replicator/base_stale_row_deleter"
 
 class Webhookdb::Replicator::IcalendarEventV1 < Webhookdb::Replicator::Base
   include Appydays::Loggable
@@ -115,7 +116,6 @@ class Webhookdb::Replicator::IcalendarEventV1 < Webhookdb::Replicator::Base
       :compound_identity,
       TEXT,
       data_key: "<compound key, see converter>",
-      index: true,
       converter: CONV_REMOTE_KEY,
       optional: true, # This is done via the converter, data_key never exists
     )
@@ -166,6 +166,7 @@ class Webhookdb::Replicator::IcalendarEventV1 < Webhookdb::Replicator::Base
     data.delete("calendar_external_id")
     data.delete("recurring_event_id")
     data.delete("recurring_event_sequence")
+    data.delete("row_updated_at")
     return data
   end
 
@@ -215,11 +216,21 @@ class Webhookdb::Replicator::IcalendarEventV1 < Webhookdb::Replicator::Base
         columns: [:calendar_external_id, :start_date, :end_date],
         where: Sequel[:status].is_distinct_from("CANCELLED") & (Sequel[:start_date] !~ nil),
       ),
+      Webhookdb::Replicator::IndexSpec.new(
+        columns: [:row_updated_at],
+        where: Sequel[status: "CANCELLED"],
+        identifier: "cancelled_row_updated_at",
+      ),
     ]
   end
 
   def _update_where_expr
-    return self.qualified_table_sequel_identifier[:last_modified_at] < Sequel[:excluded][:last_modified_at]
+    # Compare against data to avoid the constant writes. JSONB != operations are very fast,
+    # so this should not be any real performance issue.
+    # last_modified_at is unreliable because LAST-MODIFIED is unreliable,
+    # even in feeds where it is set. There are cases, such as adding an EXDATE to an RRULE,
+    # that do not trigger LAST-MODIFIED changes.
+    return self.qualified_table_sequel_identifier[:data] !~ Sequel[:excluded][:data]
   end
 
   # @param [Array<String>] lines
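
As an aside, you can preview what that Sequel expression renders to with a mock connection (an illustrative check, not code from this gem; the table name is borrowed from the comments further down):

    require "sequel"

    db = Sequel.mock(host: "postgres")
    expr = Sequel[:icalendar_event_v1_aaaa][:data] !~ Sequel[:excluded][:data]
    db.literal(expr)
    # => roughly ("icalendar_event_v1_aaaa"."data" != "excluded"."data");
    # the upsert's DO UPDATE clause only fires when the JSONB payload actually differs.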
@@ -369,50 +380,15 @@ class Webhookdb::Replicator::IcalendarEventV1 < Webhookdb::Replicator::Base
   # +stale_at+ to +age_cutoff+. This avoids endlessly adding to the icalendar events table
   # due to feeds that change UIDs each fetch- events with changed UIDs will become CANCELLED,
   # and then deleted over time.
-  # @param stale_at [Time] When an event is considered 'stale'.
-  #   If stale events are a big problem, this can be shortened to just a few days.
-  # @param age_cutoff [Time] Where to stop searching for old events.
-  #   This is important to avoid a full table scan when deleting events,
-  #   since otherwise it is like 'row_updated_at < 35.days.ago'.
-  #   Since this routine should run regularly, we should rarely have events more than 35 or 36 days old,
-  #   for example.
-  #   Use +nil+ to use no limit (a full table scan) which may be necessary when running this feature
-  #   for the first time.
-  # @param chunk_size [Integer] The row delete is done in chunks to avoid long locks.
-  #   The default seems safe, but it's exposed as a parameter if you need to play around with it,
-  #   and can be done via configuration if needed at some point.
-  def delete_stale_cancelled_events(
-    stale_at: Webhookdb::Icalendar.stale_cancelled_event_threshold_days.days.ago,
-    age_cutoff: (Webhookdb::Icalendar.stale_cancelled_event_threshold_days + 10).days.ago,
-    chunk_size: 10_000
-  )
-    # Delete in chunks, like:
-    #   DELETE from "public"."icalendar_event_v1_aaaa"
-    #   WHERE pk IN (
-    #     SELECT pk FROM "public"."icalendar_event_v1_aaaa"
-    #     WHERE row_updated_at < (now() - '35 days'::interval)
-    #     LIMIT 10000
-    #   )
-    age = age_cutoff..stale_at
-    self.admin_dataset do |ds|
-      chunk_ds = ds.where(row_updated_at: age, status: "CANCELLED").select(:pk).limit(chunk_size)
-      loop do
-        # Due to conflicts where a feed is being inserted while the delete is happening,
-        # this may raise an error like:
-        #   deadlock detected
-        #   DETAIL: Process 18352 waits for ShareLock on transaction 435085606; blocked by process 24191.
-        #   Process 24191 waits for ShareLock on transaction 435085589; blocked by process 18352.
-        #   HINT: See server log for query details.
-        #   CONTEXT: while deleting tuple (2119119,3) in relation "icalendar_event_v1_aaaa"
-        # Unit testing this is very difficult though, and in practice it is rare,
-        # and normal Sidekiq job retries should be sufficient to handle this.
-        # So we don't explicitly handle deadlocks, but could if it becomes an issue.
-        deleted = ds.where(pk: chunk_ds).delete
-        break if deleted != chunk_size
-      end
-    end
+  class StaleRowDeleter < Webhookdb::Replicator::BaseStaleRowDeleter
+    def stale_at = Webhookdb::Icalendar.stale_cancelled_event_threshold_days.days
+    def lookback_window = Webhookdb::Icalendar.stale_cancelled_event_lookback_days.days
+    def updated_at_column = :row_updated_at
+    def stale_condition = {status: "CANCELLED"}
   end
 
+  def stale_row_deleter = StaleRowDeleter.new(self)
+
   def calculate_webhook_state_machine
     if (step = self.calculate_dependency_state_machine_step(dependency_help: ""))
       return step
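
The chunked-delete behavior now lives in BaseStaleRowDeleter (data/lib/webhookdb/replicator/base_stale_row_deleter.rb, +165 lines in this release, not shown here). Judging from the removed implementation above, it should issue bounded deletes of the same shape; a hedged usage sketch, where the run entrypoint is an assumption:

    deleter = replicator.stale_row_deleter # an IcalendarEventV1::StaleRowDeleter
    # Presumably loops much like the removed code did:
    #   DELETE FROM "public"."icalendar_event_v1_aaaa"
    #   WHERE pk IN (
    #     SELECT pk FROM "public"."icalendar_event_v1_aaaa"
    #     WHERE row_updated_at < (now() - '35 days'::interval)
    #       AND status = 'CANCELLED'
    #     LIMIT 10000
    #   )
    # until a chunk deletes fewer rows than the chunk size.
    deleter.run # assumed method name; see also data/lib/webhookdb/jobs/stale_row_deleter.rb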

data/lib/webhookdb/replicator/icalendar_event_v1_partitioned.rb

@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+require "webhookdb/replicator/icalendar_event_v1"
+require "webhookdb/replicator/partitionable_mixin"
+
+class Webhookdb::Replicator::IcalendarEventV1Partitioned < Webhookdb::Replicator::IcalendarEventV1
+  include Webhookdb::Replicator::PartitionableMixin
+
+  # @return [Webhookdb::Replicator::Descriptor]
+  def self.descriptor
+    return Webhookdb::Replicator::Descriptor.new(
+      name: "icalendar_event_v1_partitioned",
+      ctor: ->(sint) { self.new(sint) },
+      dependency_descriptor: Webhookdb::Replicator::IcalendarCalendarV1.descriptor,
+      feature_roles: ["partitioning_beta"],
+      resource_name_singular: "iCalendar Event",
+      supports_webhooks: true,
+      description: "Individual events in an icalendar, using partitioned tables rather than one big table. " \
+        "See icalendar_calendar_v1.",
+      api_docs_url: "https://icalendar.org/",
+    )
+  end
+
+  def _denormalized_columns
+    d = super
+    d << Webhookdb::Replicator::Column.new(:calendar_external_hash, INTEGER, optional: true)
+    return d
+  end
+
+  def partition_method = Webhookdb::DBAdapter::Partitioning::HASH
+  def partition_column_name = :calendar_external_hash
+  def partition_value(resource) = self._str2inthash(resource.fetch("calendar_external_id"))
+end
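
Assuming standard Postgres declarative partitioning on the adapter side (the CREATE TABLE changes live in data/lib/webhookdb/db_adapter/pg.rb, +77 lines, not shown here), the resulting layout should look roughly like this sketch (partition count illustrative):

    # CREATE TABLE public.icalendar_event_v1_aaaa (...)
    #   PARTITION BY HASH (calendar_external_hash);
    # CREATE TABLE public.icalendar_event_v1_aaaa_hash0
    #   PARTITION OF public.icalendar_event_v1_aaaa
    #   FOR VALUES WITH (MODULUS 4, REMAINDER 0);
    # ...and so on for remainders 1 through 3. All events for one calendar share
    # calendar_external_hash, so they always land in the same partition.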

data/lib/webhookdb/replicator/intercom_contact_v1.rb

@@ -43,6 +43,7 @@ class Webhookdb::Replicator::IntercomContactV1 < Webhookdb::Replicator::Base
   end
 
   def _mixin_backfill_url = "https://api.intercom.io/contacts"
+  def _mixin_backfill_hashkey = "data"
 
   def _resource_and_event(request)
     resource, event = super

data/lib/webhookdb/replicator/intercom_conversation_v1.rb

@@ -40,6 +40,7 @@ class Webhookdb::Replicator::IntercomConversationV1 < Webhookdb::Replicator::Bas
   end
 
   def _mixin_backfill_url = "https://api.intercom.io/conversations"
+  def _mixin_backfill_hashkey = "conversations"
 
   def _resource_and_event(request)
     resource, event = super

data/lib/webhookdb/replicator/intercom_v1_mixin.rb

@@ -73,8 +73,9 @@ module Webhookdb::Replicator::IntercomV1Mixin
   end
 
   def _mixin_backfill_url = raise NotImplementedError
+  def _mixin_backfill_hashkey = raise NotImplementedError
 
-  def _fetch_backfill_page(pagination_token, **_kwargs)
+  def _fetch_backfill_page(pagination_token, last_backfilled:)
     unless self.auth_credentials?
       raise Webhookdb::Replicator::CredentialsMissing,
         "This integration requires that the Intercom Auth integration has a valid Auth Token"
@@ -123,8 +124,29 @@ module Webhookdb::Replicator::IntercomV1Mixin
       # a TypeError in the backfiller.
       return [], nil
     end
-    data = response.parsed_response.fetch("data", [])
+    data = response.parsed_response.fetch(self._mixin_backfill_hashkey)
     starting_after = response.parsed_response.dig("pages", "next", "starting_after")
+    # Intercom pagination sorts by updated_at, newest first. So if we are doing an incremental
+    # sync (last_backfilled set), and we last backfilled after the latest updated_at,
+    # we can stop paginating.
+    if last_backfilled && data.last && data.last["updated_at"]
+      oldest_update = Time.at(data.last["updated_at"])
+      starting_after = nil if oldest_update < last_backfilled
+    end
     return data, starting_after
   end
+
+  def _backfillers
+    return [Backfiller.new(self)]
+  end
+
+  class Backfiller < Webhookdb::Replicator::Base::ServiceBackfiller
+    include Webhookdb::Backfiller::Bulk
+
+    # Upsert for each API call
+    def upsert_page_size = Webhookdb::Intercom.page_size
+    def prepare_body(_body) = nil
+    def upserting_replicator = self.svc
+    # We don't want to override newer items from webhooks, so use conditional upsert.
+    def conditional_upsert? = true
+  end
 end
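
A quick illustration of that early-stop condition (timestamps made up; Intercom's updated_at is epoch seconds, hence the Time.at conversion):

    require "time"

    last_backfilled = Time.parse("2024-05-01 00:00:00 UTC")
    page = [{"updated_at" => 1_714_600_000}, {"updated_at" => 1_713_000_000}] # newest first
    oldest_update = Time.at(page.last["updated_at"]) # ~2024-04-13, before last_backfilled
    # Every later page is older still, so pagination can stop here:
    starting_after = nil if oldest_update < last_backfilled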

data/lib/webhookdb/replicator/partitionable_mixin.rb

@@ -0,0 +1,116 @@
+# frozen_string_literal: true
+
+# Mixin for replicators that support partitioning.
+# Partitioning is currently in beta,
+# with the following limitations/context:
+#
+# - They cannot be created from the CLI.
+#   Because the partitions must be created during the CREATE TABLE call,
+#   the partition_value must be set immediately on creation,
+#   or CREATE TABLE must be deferred.
+#   - CLI support would also require making sure this field isn't edited.
+#     This is an annoying change, so we're putting it off for now.
+#   - Instead, partitioned replicators must be created in the console.
+# - The number of HASH partitions cannot be changed;
+#   there is no good way to handle this in Postgres so we don't bother here.
+# - RANGE partitions are not supported.
+#   We need to support creating the partition when the INSERT fails.
+#   But creating the partitioned table definition itself does work/has a shared behavior at least.
+# - Existing replicators cannot be converted to partitioned.
+#   This is theoretically possible, but it seems easier to just start over
+#   with a new replicator.
+#   - Instead:
+#     - If this is a 'child' replicator, then create a new parent and this child,
+#       then copy over the parent data, either directly (for icalendar)
+#       or using HTTP requests (like with Plaid or Google) where more logic is required.
+#     - Otherwise, it'll depend on the replicator.
+#   - Then to switch clients using the old replicator, to the new replicator, you can:
+#     - Turn off all workers.
+#     - Rename the new table to the old, and old table to the new.
+#     - Update the service integrations, so the old one points to the new table name and opaque id,
+#       and the new one points to the old table name and opaque id.
+#
+module Webhookdb::Replicator::PartitionableMixin
+  # The partition method, like Webhookdb::DBAdapter::Partitioning::HASH
+  def partition_method = raise NotImplementedError
+  # The partition column name.
+  # Must be present in +_denormalized_columns+.
+  # @return [Symbol]
+  def partition_column_name = raise NotImplementedError
+  # The value for the denormalized column. For HASH partitioning this would be an integer,
+  # for RANGE partitioning this could be a timestamp, etc.
+  # Takes the resource and returns the value.
+  def partition_value(_resource) = raise NotImplementedError
+
+  def partition? = true
+
+  def partitioning
+    return Webhookdb::DBAdapter::Partitioning.new(by: self.partition_method, column: self.partition_column_name)
+  end
+
+  def _prepare_for_insert(resource, event, request, enrichment)
+    h = super
+    h[self.partition_column_name] = self.partition_value(resource)
+    return h
+  end
+
+  def _upsert_conflict_target
+    return [self.partition_column_name, self._remote_key_column.name]
+  end
+
+  # Convert the given string into a stable MD5-derived hash
+  # that can be stored in a (signed, 4 byte) INTEGER column.
+  def _str2inthash(s)
+    # MD5 is 128 bits/16 bytes/32 hex chars (2 chars per byte).
+    # Integers are 32 bits/4 bytes/8 hex chars.
+    # Grab the first 8 chars and convert it to an integer.
+    unsigned_md5int = Digest::MD5.hexdigest(s)[..8].to_i(16)
+    # Then AND it with a 32 bit bitmask to make sure it fits in 32 bits
+    # (though I'm not entirely sure why the above doesn't result in 32 bits always).
+    unsigned_int32 = unsigned_md5int & 0xFFFFFFFF
+    # Convert it from unsigned (0 to 4.2B) to signed (-2.1B to 2.1B) by subtracting 2.1B
+    # (the max 2 byte integer), as opposed to a 4 byte integer which we're dealing with here.
+    signed_md5int = unsigned_int32 - MAX_16BIT_INT
+    return signed_md5int
+  end
+
+  MAX_16BIT_INT = 2**31
+
+  # Return the partitions belonging to the table.
+  # @param db The organization connection.
+  # @return [Array<Webhookdb::DBAdapter::Partition>]
+  def existing_partitions(db)
+    # SELECT inhrelid::regclass AS child
+    # FROM pg_catalog.pg_inherits
+    # WHERE inhparent = 'my_schema.foo'::regclass;
+    parent = self.schema_and_table_symbols.map(&:to_s).join(".")
+    partnames = db[Sequel[:pg_catalog][:pg_inherits]].
+      where(inhparent: Sequel[parent].cast(:regclass)).
+      select_map(Sequel[:inhrelid].cast(:regclass))
+    parent_table = self.dbadapter_table
+    result = partnames.map do |part|
+      suffix = self.partition_suffix(part)
+      Webhookdb::DBAdapter::Partition.new(parent_table:, partition_name: part.to_sym, suffix:)
+    end
+    return result
+  end
+
+  def partition_suffix(partname)
+    return partname[/_[a-zA-Z\d]+$/].to_sym
+  end
+
+  def partition_align_name
+    tblname = self.service_integration.table_name
+    self.service_integration.organization.admin_connection do |db|
+      partitions = self.existing_partitions(db)
+      db.transaction do
+        partitions.each do |partition|
+          next if partition.partition_name.to_s.start_with?(tblname)
+          schema = partition.parent_table.schema.name
+          new_partname = "#{tblname}#{partition.suffix}"
+          db << "ALTER TABLE #{schema}.#{partition.partition_name} RENAME TO #{new_partname}"
+        end
+      end
+    end
+  end
+end
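
A worked example of that hashing (digest values depend on the input, so run it to see real numbers; the range check holds for any string):

    require "digest"

    s = "some-calendar-external-id"
    hex = Digest::MD5.hexdigest(s)     # 32 hex chars
    unsigned = hex[..8].to_i(16)       # note: [..8] takes nine hex chars (up to 36 bits)...
    unsigned &= 0xFFFFFFFF             # ...which is why the 32-bit mask is needed
    signed = unsigned - 2**31          # shift into the signed INTEGER range
    signed.between?(-2**31, 2**31 - 1) # => true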

data/lib/webhookdb/replicator/shopify_v1_mixin.rb

@@ -45,7 +45,7 @@ module Webhookdb::Replicator::ShopifyV1Mixin
       field = "api_url"
       value = "https://#{value}.myshopify.com"
     end
-    return super(field, value)
+    return super
   end
 
   def calculate_webhook_state_machine

data/lib/webhookdb/replicator/signalwire_message_v1.rb

@@ -52,7 +52,7 @@ class Webhookdb::Replicator::SignalwireMessageV1 < Webhookdb::Replicator::Base
       h = u.host.gsub(/\.signalwire\.com$/, "")
       value = h
     end
-    return super(field, value, attr:)
+    return super
   end
 
   def calculate_backfill_state_machine
@@ -200,7 +200,6 @@ Press 'Show' next to the newly-created API token, and copy it.)
       request_url = e.uri.to_s
       request_method = e.http_method
     end
-    self.logger.warn("signalwire_backfill_error", response_body:, response_status:, request_url:)
     message = Webhookdb::Messages::ErrorGenericBackfill.new(
       self.service_integration,
       response_status:,

data/lib/webhookdb/replicator/sponsy_v1_mixin.rb

@@ -99,7 +99,7 @@ module Webhookdb::Replicator::SponsyV1Mixin
     )
   rescue Webhookdb::Http::Error => e
     raise e unless e.status == 404
-    self.logger.warn("sponsy_404", error: e)
+    self.logger.warn("sponsy_404", e)
     return [], nil
   end
 