source_monitor 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/commands/release.md +101 -58
- data/.claude/skills/sm-configure/SKILL.md +13 -2
- data/.claude/skills/sm-configure/reference/configuration-reference.md +33 -0
- data/.claude/skills/sm-host-setup/SKILL.md +15 -1
- data/.claude/skills/sm-host-setup/reference/setup-checklist.md +33 -0
- data/.claude/skills/sm-job/SKILL.md +1 -1
- data/.vbw-planning/REQUIREMENTS.md +22 -0
- data/.vbw-planning/ROADMAP.md +125 -0
- data/.vbw-planning/STATE.md +43 -0
- data/.vbw-planning/config.json +3 -1
- data/.vbw-planning/discovery.json +3 -1
- data/.vbw-planning/phases/01-generator-steps/01-CONTEXT.md +33 -0
- data/.vbw-planning/phases/01-generator-steps/01-VERIFICATION.md +86 -0
- data/.vbw-planning/phases/01-generator-steps/PLAN-01-SUMMARY.md +61 -0
- data/.vbw-planning/phases/01-generator-steps/PLAN-01.md +380 -0
- data/.vbw-planning/phases/02-verification/02-VERIFICATION.md +78 -0
- data/.vbw-planning/phases/02-verification/PLAN-01-SUMMARY.md +46 -0
- data/.vbw-planning/phases/02-verification/PLAN-01.md +500 -0
- data/.vbw-planning/phases/03-docs-alignment/03-VERIFICATION.md +89 -0
- data/.vbw-planning/phases/03-docs-alignment/PLAN-01-SUMMARY.md +48 -0
- data/.vbw-planning/phases/03-docs-alignment/PLAN-01.md +456 -0
- data/.vbw-planning/phases/04-dashboard-ux/04-VERIFICATION.md +129 -0
- data/.vbw-planning/phases/04-dashboard-ux/PLAN-01-SUMMARY.md +70 -0
- data/.vbw-planning/phases/04-dashboard-ux/PLAN-01.md +747 -0
- data/.vbw-planning/phases/05-active-storage-images/05-VERIFICATION.md +156 -0
- data/.vbw-planning/phases/05-active-storage-images/PLAN-01-SUMMARY.md +69 -0
- data/.vbw-planning/phases/05-active-storage-images/PLAN-01.md +455 -0
- data/.vbw-planning/phases/05-active-storage-images/PLAN-02-SUMMARY.md +39 -0
- data/.vbw-planning/phases/05-active-storage-images/PLAN-02.md +488 -0
- data/.vbw-planning/phases/06-netflix-feed-fix/06-VERIFICATION.md +100 -0
- data/.vbw-planning/phases/06-netflix-feed-fix/PLAN-01-SUMMARY.md +37 -0
- data/.vbw-planning/phases/06-netflix-feed-fix/PLAN-01.md +345 -0
- data/CHANGELOG.md +31 -0
- data/Gemfile.lock +1 -1
- data/VERSION +1 -1
- data/app/assets/builds/source_monitor/application.css +9 -0
- data/app/helpers/source_monitor/application_helper.rb +38 -0
- data/app/jobs/source_monitor/download_content_images_job.rb +72 -0
- data/app/models/source_monitor/item_content.rb +2 -0
- data/app/views/source_monitor/dashboard/_recent_activity.html.erb +9 -0
- data/app/views/source_monitor/items/_details.html.erb +2 -2
- data/app/views/source_monitor/logs/index.html.erb +9 -0
- data/app/views/source_monitor/sources/_details.html.erb +2 -2
- data/app/views/source_monitor/sources/_row.html.erb +1 -1
- data/docs/setup.md +10 -1
- data/docs/troubleshooting.md +38 -7
- data/lib/generators/source_monitor/install/install_generator.rb +101 -0
- data/lib/source_monitor/configuration/http_settings.rb +7 -1
- data/lib/source_monitor/configuration/images_settings.rb +37 -0
- data/lib/source_monitor/configuration.rb +3 -1
- data/lib/source_monitor/dashboard/queries/recent_activity_query.rb +16 -7
- data/lib/source_monitor/dashboard/recent_activity.rb +1 -0
- data/lib/source_monitor/dashboard/recent_activity_presenter.rb +15 -2
- data/lib/source_monitor/fetching/feed_fetcher/entry_processor.rb +13 -0
- data/lib/source_monitor/http.rb +23 -0
- data/lib/source_monitor/images/content_rewriter.rb +81 -0
- data/lib/source_monitor/images/downloader.rb +82 -0
- data/lib/source_monitor/logs/table_presenter.rb +25 -0
- data/lib/source_monitor/setup/procfile_patcher.rb +31 -0
- data/lib/source_monitor/setup/queue_config_patcher.rb +84 -0
- data/lib/source_monitor/setup/verification/recurring_schedule_verifier.rb +102 -0
- data/lib/source_monitor/setup/verification/runner.rb +1 -1
- data/lib/source_monitor/setup/verification/solid_queue_verifier.rb +1 -1
- data/lib/source_monitor/setup/workflow.rb +10 -0
- data/lib/source_monitor/version.rb +1 -1
- data/lib/source_monitor.rb +8 -0
- metadata +31 -1
data/docs/setup.md
CHANGED
|
@@ -49,7 +49,12 @@ This ensures Bundler can load SourceMonitor so the commands below are available.
|
|
|
49
49
|
```bash
|
|
50
50
|
bin/rails solid_queue:start
|
|
51
51
|
```
|
|
52
|
-
|
|
52
|
+
The install generator automatically handles all worker configuration:
|
|
53
|
+
- **Recurring jobs** are configured in `config/recurring.yml` (fetch scheduling, scraping, cleanup).
|
|
54
|
+
- **Procfile.dev** is patched with a `jobs:` entry so `bin/dev` starts Solid Queue alongside the web server.
|
|
55
|
+
- **Queue dispatcher** is patched with `recurring_schedule: config/recurring.yml` in `config/queue.yml` so recurring jobs load on startup.
|
|
56
|
+
|
|
57
|
+
All three steps are idempotent. If any configuration is missing, re-run: `bin/rails generate source_monitor:install`
|
|
53
58
|
|
|
54
59
|
4. **Visit the dashboard** at the chosen mount path, create a source, and trigger “Fetch Now” to validate realtime updates and Solid Queue processing.
|
|
55
60
|
|
|
@@ -87,6 +92,8 @@ Prefer to script each step or plug SourceMonitor into an existing deployment che
|
|
|
87
92
|
| 4 | `bin/rails railties:install:migrations FROM=source_monitor` | Copy engine migrations (idempotent) |
|
|
88
93
|
| 5 | `bin/rails db:migrate` | Apply schema updates, including Solid Queue tables |
|
|
89
94
|
| 6 | `bin/rails solid_queue:start` | Ensure jobs process via Solid Queue |
|
|
95
|
+
| 6a | Handled by generator (patches `Procfile.dev`) | Ensure `bin/dev` starts Solid Queue workers |
|
|
96
|
+
| 6b | Handled by generator (patches `config/queue.yml`) | Wire recurring jobs into Solid Queue dispatcher |
|
|
90
97
|
| 7 | `bin/jobs --recurring_schedule_file=config/recurring.yml` | Start recurring scheduler (optional but recommended) |
|
|
91
98
|
| 8 | `bin/source_monitor verify` | Confirm Solid Queue/Action Cable readiness and emit telemetry |
|
|
92
99
|
|
|
@@ -100,6 +107,8 @@ Prefer to script each step or plug SourceMonitor into an existing deployment che
|
|
|
100
107
|
4. **Apply database changes** using `bin/rails db:migrate`. If your host already installed Solid Queue migrations manually, delete duplicate files before migrating.
|
|
101
108
|
5. **Wire Action Cable** if necessary. SourceMonitor defaults to Solid Cable; confirm `ApplicationCable::Connection`/`Channel` exist and that `config/initializers/source_monitor.rb` uses the adapter you expect. To switch to Redis, set `config.realtime.adapter = :redis` and `config.realtime.redis_url`.
|
|
102
109
|
6. **Start workers** with `bin/rails solid_queue:start` (or your process manager). The install generator automatically configures recurring jobs in `config/recurring.yml` for fetch scheduling, scraping, and cleanup. They'll run with `bin/dev` or `bin/jobs`.
|
|
110
|
+
- **Procfile.dev:** The generator automatically patches `Procfile.dev` with a `jobs:` entry for Solid Queue. Verify the file contains `jobs: bundle exec rake solid_queue:start` after running the generator.
|
|
111
|
+
- **Recurring schedule:** The generator automatically patches `config/queue.yml` dispatchers with `recurring_schedule: config/recurring.yml`. Verify the key is present after running the generator.
|
|
103
112
|
7. **Review the initializer** and tune queue names, HTTP timeouts, scraping adapters, retention limits, authentication hooks, and Mission Control integration. The [configuration reference](configuration.md) details every option.
|
|
104
113
|
8. **Verify the install**: run `bin/source_monitor verify` to ensure Solid Queue workers and Action Cable are healthy, then visit the mount path to trigger a fetch manually. Enable telemetry if you want JSON logs recorded for support.
|
|
105
114
|
|
data/docs/troubleshooting.md
CHANGED
|
@@ -20,45 +20,76 @@ This guide lists common issues you might encounter while installing, upgrading,
|
|
|
20
20
|
- Ensure at least one Solid Queue worker is running; the dashboard reads visibility data via `SourceMonitor::Jobs::Visibility`.
|
|
21
21
|
- When using mission control integration, keep `config.mission_control_dashboard_path` pointing at a valid route helper; otherwise the dashboard hides the link.
|
|
22
22
|
|
|
23
|
-
## 4.
|
|
23
|
+
## 4. Recurring Jobs Not Running
|
|
24
|
+
|
|
25
|
+
- **Symptoms:** Fetch scheduling, scrape scheduling, and cleanup jobs never fire. Sources never auto-fetch on their configured intervals.
|
|
26
|
+
- **Primary fix:** Re-run the install generator, which automatically patches the dispatcher config:
|
|
27
|
+
```bash
|
|
28
|
+
bin/rails generate source_monitor:install
|
|
29
|
+
```
|
|
30
|
+
- **Diagnostics:** Run `bin/source_monitor verify` to check recurring task registration. The RecurringScheduleVerifier will report whether SourceMonitor recurring tasks are loaded into Solid Queue.
|
|
31
|
+
- **Manual check:** Verify `config/queue.yml` includes `recurring_schedule: config/recurring.yml` under the `dispatchers:` section. Without this key, Solid Queue's dispatcher will not load the recurring schedule even though `config/recurring.yml` exists.
|
|
32
|
+
- **Manual fix (if generator cannot patch):**
|
|
33
|
+
```yaml
|
|
34
|
+
dispatchers:
|
|
35
|
+
- polling_interval: 1
|
|
36
|
+
batch_size: 500
|
|
37
|
+
recurring_schedule: config/recurring.yml
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## 5. Jobs Not Processing with bin/dev
|
|
41
|
+
|
|
42
|
+
- **Symptoms:** `bin/dev` starts the web server but jobs never run. Running `bin/rails solid_queue:start` manually works fine.
|
|
43
|
+
- **Primary fix:** Re-run the install generator, which automatically patches `Procfile.dev`:
|
|
44
|
+
```bash
|
|
45
|
+
bin/rails generate source_monitor:install
|
|
46
|
+
```
|
|
47
|
+
- **Diagnostics:** Run `bin/source_monitor verify` to check Solid Queue worker status. If no workers are detected, the SolidQueueVerifier will suggest checking `Procfile.dev`.
|
|
48
|
+
- **Manual check:** Verify `Procfile.dev` includes a `jobs:` line:
|
|
49
|
+
```
|
|
50
|
+
jobs: bundle exec rake solid_queue:start
|
|
51
|
+
```
|
|
52
|
+
- Most Rails 8 apps use foreman or overmind via `bin/dev`. Without a `jobs:` entry, the process manager only starts the web server and asset watchers -- Solid Queue workers are not launched.
|
|
53
|
+
|
|
54
|
+
## 6. Realtime Updates Do Not Stream
|
|
24
55
|
|
|
25
56
|
- Confirm Action Cable is mounted and `ApplicationCable` classes exist (see installation guide).
|
|
26
57
|
- In production, verify WebSocket proxy settings allow the `/cable` endpoint.
|
|
27
58
|
- When switching to Redis, add `config.realtime.adapter = :redis` and `config.realtime.redis_url` in the initializer, then restart web and worker processes.
|
|
28
59
|
- For Solid Cable, check that the `solid_cable_messages` table exists and that no other process clears it unexpectedly.
|
|
29
60
|
|
|
30
|
-
##
|
|
61
|
+
## 7. Fetch Jobs Keep Failing
|
|
31
62
|
|
|
32
63
|
- Review the most recent fetch log entry for the source; it stores the HTTP status, error class, and error message.
|
|
33
64
|
- Increase `config.http.timeout` or `config.http.retry_max` if the feed is slow or prone to transient errors.
|
|
34
65
|
- Supply custom headers or basic auth credentials via the source form when feeds require authentication.
|
|
35
66
|
- Check for TLS issues on self-signed feeds; you may need to configure Faraday with custom SSL options.
|
|
36
67
|
|
|
37
|
-
##
|
|
68
|
+
## 8. Scraping Returns "Failed"
|
|
38
69
|
|
|
39
70
|
- Confirm the source has scraping enabled and the configured adapter exists.
|
|
40
71
|
- Override selectors in the source's scrape settings if the default Readability extraction misses key elements.
|
|
41
72
|
- Inspect the scrape log to see the adapter status and content length. Logs store the HTTP status and any exception raised by the adapter.
|
|
42
73
|
- Retry manually from the item detail page after fixing selectors.
|
|
43
74
|
|
|
44
|
-
##
|
|
75
|
+
## 9. Cleanup Rake Tasks Fail
|
|
45
76
|
|
|
46
77
|
- Pass numeric values for `FETCH_LOG_DAYS` or `SCRAPE_LOG_DAYS` environment variables (e.g., `FETCH_LOG_DAYS=30`).
|
|
47
78
|
- Ensure workers or the console environment have permission to soft delete (`SOFT_DELETE=true`) if you expect tombstones.
|
|
48
79
|
- If job classes cannot load, verify `SourceMonitor.configure` ran before calling `rake source_monitor:cleanup:*`.
|
|
49
80
|
|
|
50
|
-
##
|
|
81
|
+
## 10. Test Suite Cannot Launch a Browser
|
|
51
82
|
|
|
52
83
|
- System tests rely on Selenium + Chrome. Install Chrome/Chromium and set `SELENIUM_CHROME_BINARY` if the binary lives in a non-standard path.
|
|
53
84
|
- You can run `rbenv exec bin/test-coverage --verbose` to inspect failures with additional logging.
|
|
54
85
|
|
|
55
|
-
##
|
|
86
|
+
## 11. Mission Control Jobs Link Returns 404
|
|
56
87
|
|
|
57
88
|
- Mount `MissionControl::Jobs::Engine` in your host routes (for example, `mount MissionControl::Jobs::Engine, at: "/mission_control"`).
|
|
58
89
|
- Keep `config.mission_control_enabled = true` **and** `config.mission_control_dashboard_path` pointing at that mounted route helper. Call `SourceMonitor.mission_control_dashboard_path` in the Rails console to confirm it resolves.
|
|
59
90
|
- When hosting Mission Control in a separate app, provide a full URL instead of a route helper and ensure CORS/WebSocket settings allow the dashboard iframe.
|
|
60
91
|
|
|
61
|
-
##
|
|
92
|
+
## 12. Tailwind Build Fails or Admin UI Loads Without Styles
|
|
62
93
|
|
|
63
94
|
- Running `test/dummy/bin/dev` before configuring the bundling pipeline will serve the admin UI without Tailwind styles or Stimulus behaviours. This happens because the engine no longer ships precompiled assets; see `.ai/engine-asset-configuration.md:11-44` for the required npm setup.
|
|
64
95
|
- Fix by running `npm install` followed by `npm run build` inside the engine root so that `app/assets/builds/source_monitor/application.css` and `application.js` exist. The Rake task `app:source_monitor:assets:build` wraps the same scripts for CI usage.
|
|
@@ -49,7 +49,48 @@ module SourceMonitor
|
|
|
49
49
|
end
|
|
50
50
|
end
|
|
51
51
|
|
|
52
|
+
# Makes sure Procfile.dev in the destination app declares a `jobs:` process.
#
# * No Procfile.dev yet: one is created with a web entry plus the jobs entry.
# * A line starting with `jobs:` already exists: nothing is written.
# * Otherwise: a blank line and the jobs entry are appended.
#
# Each outcome is reported through say_status.
def patch_procfile_dev
  target = File.join(destination_root, "Procfile.dev")

  unless File.exist?(target)
    File.write(target, "web: bin/rails server -p 3000\n#{PROCFILE_JOBS_ENTRY}\n")
    return say_status(:create, "Procfile.dev", :green)
  end

  # `^` anchors per line, so any line beginning with "jobs:" counts.
  if File.read(target).match?(/^jobs:/)
    say_status :skip, "Procfile.dev (jobs entry already present)", :yellow
    return
  end

  File.open(target, "a") { |file| file.puts("", PROCFILE_JOBS_ENTRY) }
  say_status :append, "Procfile.dev", :green
end
|
|
69
|
+
|
|
70
|
+
# Adds `recurring_schedule` to the dispatchers declared in config/queue.yml.
#
# Skips (with a status message) when the file does not exist or when a
# dispatcher already carries a `recurring_schedule` key. Otherwise the parsed
# document is patched in memory and the file is rewritten via YAML.dump.
#
# NOTE(review): the file is re-serialised from the parsed structure, so
# formatting details of the original file are presumably not preserved —
# confirm this is acceptable for host apps.
def configure_queue_dispatcher
  queue_file = File.join(destination_root, "config/queue.yml")

  if File.exist?(queue_file)
    # An empty file parses to nil/false; treat it as an empty mapping.
    config = YAML.safe_load(File.read(queue_file), aliases: true) || {}

    if queue_config_has_recurring_schedule?(config)
      say_status :skip, "config/queue.yml (recurring_schedule already configured)", :yellow
      nil
    else
      add_recurring_schedule_to_dispatchers!(config)
      File.write(queue_file, YAML.dump(config))
      say_status :append, "config/queue.yml (added recurring_schedule to dispatchers)", :green
    end
  else
    say_status :skip, "config/queue.yml (file not found — create it or run rails app:update to generate)", :yellow
    nil
  end
end
|
|
89
|
+
|
|
52
90
|
def print_next_steps
|
|
91
|
+
say_status :info,
|
|
92
|
+
"Procfile.dev configured — run bin/dev to start both web server and Solid Queue workers.",
|
|
93
|
+
:green
|
|
53
94
|
say_status :info,
|
|
54
95
|
"Recurring jobs configured in config/recurring.yml — they'll run automatically with bin/dev or bin/jobs.",
|
|
55
96
|
:green
|
|
@@ -60,6 +101,8 @@ module SourceMonitor
|
|
|
60
101
|
|
|
61
102
|
private
|
|
62
103
|
|
|
104
|
+
PROCFILE_JOBS_ENTRY = "jobs: bundle exec rake solid_queue:start"
|
|
105
|
+
|
|
63
106
|
RECURRING_ENTRIES = {
|
|
64
107
|
"source_monitor_schedule_fetches" => {
|
|
65
108
|
"class" => "SourceMonitor::ScheduleFetchesJob",
|
|
@@ -154,6 +197,64 @@ module SourceMonitor
|
|
|
154
197
|
path = (raw_path && !raw_path.strip.empty?) ? raw_path.strip : "/source_monitor"
|
|
155
198
|
path.start_with?("/") ? path : "/#{path}"
|
|
156
199
|
end
|
|
200
|
+
|
|
201
|
+
# Value written to each dispatcher's `recurring_schedule` key.
RECURRING_SCHEDULE_VALUE = "config/recurring.yml"

# Dispatcher entry added when the queue config declares no dispatchers at all.
DEFAULT_DISPATCHER = {
  "polling_interval" => 1,
  "batch_size" => 500,
  "recurring_schedule" => RECURRING_SCHEDULE_VALUE
}.freeze

# Reports whether any dispatcher in the parsed queue config already has a
# `recurring_schedule` key.
#
# Looks at a `dispatchers` array on +parsed+ itself (flat config) and then
# recurses into every nested Hash (environment-scoped config).
#
# @param parsed [Object] result of parsing config/queue.yml
# @return [Boolean] false for anything that is not a Hash
def queue_config_has_recurring_schedule?(parsed)
  return false unless parsed.is_a?(Hash)

  configured = dispatcher_list(parsed)&.any? do |dispatcher|
    dispatcher.is_a?(Hash) && dispatcher.key?("recurring_schedule")
  end
  return true if configured

  parsed.each_value.any? { |value| queue_config_has_recurring_schedule?(value) }
end

# Sets `recurring_schedule` on every dispatcher Hash found either at the top
# level of +parsed+ or directly inside one of its Hash-valued sections.
# Existing `recurring_schedule` values are left untouched. When no dispatchers
# array exists anywhere, a top-level `dispatchers` list holding a copy of
# DEFAULT_DISPATCHER is added.
#
# NOTE(review): for an environment-scoped file with no dispatchers the default
# is still added at the top level — confirm Solid Queue reads it from there.
#
# @param parsed [Hash] parsed queue config; mutated in place
def add_recurring_schedule_to_dispatchers!(parsed)
  found_dispatchers = false

  [ *parsed.each_value.grep(Hash), parsed ].each do |section|
    dispatchers = dispatcher_list(section)
    next unless dispatchers

    dispatchers.each do |dispatcher|
      next unless dispatcher.is_a?(Hash)

      dispatcher["recurring_schedule"] ||= RECURRING_SCHEDULE_VALUE
    end
    found_dispatchers = true
  end

  parsed["dispatchers"] = [ DEFAULT_DISPATCHER.dup ] unless found_dispatchers
end

# Returns the `dispatchers` array of +section+ (string or symbol key), or nil
# when the key is missing or does not hold an Array.
def dispatcher_list(section)
  list = section["dispatchers"] || section[:dispatchers]
  list if list.is_a?(Array)
end
|
|
157
258
|
end
|
|
158
259
|
end
|
|
159
260
|
end
|
|
@@ -13,7 +13,10 @@ module SourceMonitor
|
|
|
13
13
|
:retry_interval,
|
|
14
14
|
:retry_interval_randomness,
|
|
15
15
|
:retry_backoff_factor,
|
|
16
|
-
:retry_statuses
|
|
16
|
+
:retry_statuses,
|
|
17
|
+
:ssl_ca_file,
|
|
18
|
+
:ssl_ca_path,
|
|
19
|
+
:ssl_verify
|
|
17
20
|
|
|
18
21
|
def initialize
|
|
19
22
|
reset!
|
|
@@ -31,6 +34,9 @@ module SourceMonitor
|
|
|
31
34
|
@retry_interval_randomness = 0.5
|
|
32
35
|
@retry_backoff_factor = 2
|
|
33
36
|
@retry_statuses = nil
|
|
37
|
+
@ssl_ca_file = nil
|
|
38
|
+
@ssl_ca_path = nil
|
|
39
|
+
@ssl_verify = true
|
|
34
40
|
end
|
|
35
41
|
|
|
36
42
|
private
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  class Configuration
    # Options controlling whether and how images referenced by item content
    # are downloaded. Every option is a plain read/write attribute; call
    # #reset! to restore the defaults below.
    class ImagesSettings
      DEFAULT_MAX_DOWNLOAD_SIZE = 10 * 1024 * 1024 # 10 MB
      DEFAULT_DOWNLOAD_TIMEOUT = 30 # seconds
      DEFAULT_ALLOWED_CONTENT_TYPES = %w[
        image/jpeg
        image/png
        image/gif
        image/webp
        image/svg+xml
      ].freeze

      attr_accessor :download_to_active_storage,
        :max_download_size,
        :download_timeout,
        :allowed_content_types

      def initialize
        reset!
      end

      # Restores every option to its default. Downloading is off by default and
      # the allowed content types get a fresh, mutable copy of the default list.
      def reset!
        self.download_to_active_storage = false
        self.max_download_size = DEFAULT_MAX_DOWNLOAD_SIZE
        self.download_timeout = DEFAULT_DOWNLOAD_TIMEOUT
        self.allowed_content_types = DEFAULT_ALLOWED_CONTENT_TYPES.dup
      end

      # @return [Boolean] true when download_to_active_storage is truthy
      def download_enabled?
        download_to_active_storage ? true : false
      end
    end
  end
end
|
|
@@ -8,6 +8,7 @@ require "source_monitor/configuration/scraping_settings"
|
|
|
8
8
|
require "source_monitor/configuration/realtime_settings"
|
|
9
9
|
require "source_monitor/configuration/retention_settings"
|
|
10
10
|
require "source_monitor/configuration/authentication_settings"
|
|
11
|
+
require "source_monitor/configuration/images_settings"
|
|
11
12
|
require "source_monitor/configuration/scraper_registry"
|
|
12
13
|
require "source_monitor/configuration/events"
|
|
13
14
|
require "source_monitor/configuration/validation_definition"
|
|
@@ -26,7 +27,7 @@ module SourceMonitor
|
|
|
26
27
|
:mission_control_enabled,
|
|
27
28
|
:mission_control_dashboard_path
|
|
28
29
|
|
|
29
|
-
attr_reader :http, :scrapers, :retention, :events, :models, :realtime, :fetching, :health, :authentication, :scraping
|
|
30
|
+
attr_reader :http, :scrapers, :retention, :events, :models, :realtime, :fetching, :health, :authentication, :scraping, :images
|
|
30
31
|
|
|
31
32
|
DEFAULT_QUEUE_NAMESPACE = "source_monitor"
|
|
32
33
|
|
|
@@ -50,6 +51,7 @@ module SourceMonitor
|
|
|
50
51
|
@health = HealthSettings.new
|
|
51
52
|
@authentication = AuthenticationSettings.new
|
|
52
53
|
@scraping = ScrapingSettings.new
|
|
54
|
+
@images = ImagesSettings.new
|
|
53
55
|
end
|
|
54
56
|
|
|
55
57
|
def queue_name_for(role)
|
|
@@ -37,7 +37,8 @@ module SourceMonitor
|
|
|
37
37
|
item_title: row["item_title"],
|
|
38
38
|
item_url: row["item_url"],
|
|
39
39
|
source_name: row["source_name"],
|
|
40
|
-
source_id: row["source_id"]
|
|
40
|
+
source_id: row["source_id"],
|
|
41
|
+
source_feed_url: row["source_feed_url"]
|
|
41
42
|
)
|
|
42
43
|
end
|
|
43
44
|
|
|
@@ -57,7 +58,8 @@ module SourceMonitor
|
|
|
57
58
|
item_title,
|
|
58
59
|
item_url,
|
|
59
60
|
source_name,
|
|
60
|
-
source_id
|
|
61
|
+
source_id,
|
|
62
|
+
source_feed_url
|
|
61
63
|
FROM (
|
|
62
64
|
#{fetch_log_sql}
|
|
63
65
|
UNION ALL
|
|
@@ -83,9 +85,12 @@ module SourceMonitor
|
|
|
83
85
|
NULL AS scraper_adapter,
|
|
84
86
|
NULL AS item_title,
|
|
85
87
|
NULL AS item_url,
|
|
86
|
-
|
|
87
|
-
#{SourceMonitor::FetchLog.quoted_table_name}.source_id AS source_id
|
|
88
|
+
#{SourceMonitor::Source.quoted_table_name}.#{quoted_source_name} AS source_name,
|
|
89
|
+
#{SourceMonitor::FetchLog.quoted_table_name}.source_id AS source_id,
|
|
90
|
+
#{SourceMonitor::Source.quoted_table_name}.feed_url AS source_feed_url
|
|
88
91
|
FROM #{SourceMonitor::FetchLog.quoted_table_name}
|
|
92
|
+
LEFT JOIN #{SourceMonitor::Source.quoted_table_name}
|
|
93
|
+
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::FetchLog.quoted_table_name}.source_id
|
|
89
94
|
SQL
|
|
90
95
|
end
|
|
91
96
|
|
|
@@ -100,12 +105,15 @@ module SourceMonitor
|
|
|
100
105
|
NULL AS items_updated,
|
|
101
106
|
#{SourceMonitor::ScrapeLog.quoted_table_name}.scraper_adapter AS scraper_adapter,
|
|
102
107
|
NULL AS item_title,
|
|
103
|
-
|
|
108
|
+
#{SourceMonitor::Item.quoted_table_name}.url AS item_url,
|
|
104
109
|
#{SourceMonitor::Source.quoted_table_name}.#{quoted_source_name} AS source_name,
|
|
105
|
-
#{SourceMonitor::ScrapeLog.quoted_table_name}.source_id AS source_id
|
|
110
|
+
#{SourceMonitor::ScrapeLog.quoted_table_name}.source_id AS source_id,
|
|
111
|
+
NULL AS source_feed_url
|
|
106
112
|
FROM #{SourceMonitor::ScrapeLog.quoted_table_name}
|
|
107
113
|
LEFT JOIN #{SourceMonitor::Source.quoted_table_name}
|
|
108
114
|
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::ScrapeLog.quoted_table_name}.source_id
|
|
115
|
+
LEFT JOIN #{SourceMonitor::Item.quoted_table_name}
|
|
116
|
+
ON #{SourceMonitor::Item.quoted_table_name}.id = #{SourceMonitor::ScrapeLog.quoted_table_name}.item_id
|
|
109
117
|
SQL
|
|
110
118
|
end
|
|
111
119
|
|
|
@@ -122,7 +130,8 @@ module SourceMonitor
|
|
|
122
130
|
#{SourceMonitor::Item.quoted_table_name}.title AS item_title,
|
|
123
131
|
#{SourceMonitor::Item.quoted_table_name}.url AS item_url,
|
|
124
132
|
#{SourceMonitor::Source.quoted_table_name}.#{quoted_source_name} AS source_name,
|
|
125
|
-
#{SourceMonitor::Item.quoted_table_name}.source_id AS source_id
|
|
133
|
+
#{SourceMonitor::Item.quoted_table_name}.source_id AS source_id,
|
|
134
|
+
NULL AS source_feed_url
|
|
126
135
|
FROM #{SourceMonitor::Item.quoted_table_name}
|
|
127
136
|
LEFT JOIN #{SourceMonitor::Source.quoted_table_name}
|
|
128
137
|
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::Item.quoted_table_name}.source_id
|
|
@@ -30,13 +30,16 @@ module SourceMonitor
|
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
# Builds the view hash for a fetch-log event: label, created/updated summary,
# success/failure status, timestamp, link to the fetch log, and the source's
# feed URL (shown as its host, linked in full).
def fetch_event(event)
  outcome = event.success? ? :success : :failure
  summary = "#{event.items_created.to_i} created / #{event.items_updated.to_i} updated"
  feed_url = event.source_feed_url

  {
    label: "Fetch ##{event.id}",
    description: summary,
    status: outcome,
    type: :fetch,
    time: event.occurred_at,
    path: url_helpers.fetch_log_path(event.id),
    url_display: source_domain(feed_url),
    url_href: feed_url
  }
end
|
|
42
45
|
|
|
@@ -47,10 +50,20 @@ module SourceMonitor
|
|
|
47
50
|
status: event.success? ? :success : :failure,
|
|
48
51
|
type: :scrape,
|
|
49
52
|
time: event.occurred_at,
|
|
50
|
-
path: url_helpers.scrape_log_path(event.id)
|
|
53
|
+
path: url_helpers.scrape_log_path(event.id),
|
|
54
|
+
url_display: event.item_url,
|
|
55
|
+
url_href: event.item_url
|
|
51
56
|
}
|
|
52
57
|
end
|
|
53
58
|
|
|
59
|
+
# Extracts the host portion of a feed URL.
#
# @param feed_url [#to_s, nil]
# @return [String, nil] nil when the URL is blank, cannot be parsed, or has no host
def source_domain(feed_url)
  return if feed_url.blank?

  parsed = begin
    URI.parse(feed_url.to_s)
  rescue URI::InvalidURIError
    nil
  end

  parsed&.host
end
|
|
66
|
+
|
|
54
67
|
def item_event(event)
|
|
55
68
|
{
|
|
56
69
|
label: event.item_title.presence || "New Item",
|
|
@@ -38,6 +38,7 @@ module SourceMonitor
|
|
|
38
38
|
created += 1
|
|
39
39
|
created_items << result.item
|
|
40
40
|
SourceMonitor::Events.after_item_created(item: result.item, source:, entry:, result: result)
|
|
41
|
+
enqueue_image_download(result.item)
|
|
41
42
|
else
|
|
42
43
|
updated += 1
|
|
43
44
|
updated_items << result.item
|
|
@@ -61,6 +62,18 @@ module SourceMonitor
|
|
|
61
62
|
|
|
62
63
|
private
|
|
63
64
|
|
|
65
|
+
# Queues DownloadContentImagesJob for +item+ when image downloading is enabled
# and the item has content. Any error raised while enqueuing is logged (when a
# Rails logger is available) and swallowed so feed processing continues.
def enqueue_image_download(item)
  images_enabled = SourceMonitor.config.images.download_enabled?

  if images_enabled && !item.content.blank?
    SourceMonitor::DownloadContentImagesJob.perform_later(item.id)
  end
rescue StandardError => error
  # Image download enqueue failure must never break feed processing
  logger = Rails.logger if defined?(Rails) && Rails.respond_to?(:logger)
  logger.error("[SourceMonitor] Failed to enqueue image download for item #{item.id}: #{error.message}") if logger
end
|
|
76
|
+
|
|
64
77
|
def normalize_item_error(entry, error)
|
|
65
78
|
{
|
|
66
79
|
guid: safe_entry_guid(entry),
|
data/lib/source_monitor/http.rb
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "openssl"
|
|
3
4
|
require "faraday"
|
|
4
5
|
require "faraday/retry"
|
|
5
6
|
require "faraday/follow_redirects"
|
|
@@ -57,9 +58,31 @@ module SourceMonitor
|
|
|
57
58
|
connection.headers[key] = value
|
|
58
59
|
end
|
|
59
60
|
|
|
61
|
+
configure_ssl(connection, settings)
|
|
62
|
+
|
|
60
63
|
connection.adapter Faraday.default_adapter
|
|
61
64
|
end
|
|
62
65
|
|
|
66
|
+
# Applies the SSL settings to a connection.
#
# Verification stays on unless ssl_verify is explicitly false. Trust anchors
# come from the first of: ssl_ca_file, ssl_ca_path, or a cert store loaded
# with the system default paths. The explicit store exists because some
# systems otherwise fail to verify chains that depend on intermediate CAs
# (e.g., Medium/Netflix on AWS).
def configure_ssl(connection, settings)
  ssl_options = connection.ssl
  ssl_options.verify = !(settings.ssl_verify == false)

  if (custom_file = settings.ssl_ca_file)
    ssl_options.ca_file = custom_file
  elsif (custom_dir = settings.ssl_ca_path)
    ssl_options.ca_path = custom_dir
  else
    ssl_options.cert_store = default_cert_store
  end
end
|
|
81
|
+
|
|
82
|
+
# Builds a new certificate store populated from OpenSSL's default paths.
def default_cert_store
  store = OpenSSL::X509::Store.new
  store.set_default_paths
  store
end
|
|
85
|
+
|
|
63
86
|
def default_headers(settings)
|
|
64
87
|
base_headers = {
|
|
65
88
|
"User-Agent" => resolve_callable(settings.user_agent).presence || DEFAULT_USER_AGENT,
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokolexbor"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
module SourceMonitor
  module Images
    # Finds and rewrites the `src` attribute of <img> tags in an HTML fragment.
    #
    # Relative sources are resolved against +base_url+ when one is given. Only
    # http/https URLs are reported by #image_urls or offered to #rewrite.
    class ContentRewriter
      attr_reader :html, :base_url

      # @param html [#to_s] HTML fragment (nil becomes "")
      # @param base_url [String, nil] base for resolving relative image sources
      def initialize(html, base_url: nil)
        @html = html.to_s
        @base_url = base_url
      end

      # Returns an array of absolute image URLs found in <img> tags.
      # Skips data: URIs, blank src, and invalid URLs. Duplicates are removed.
      def image_urls
        return [] if html.blank?

        downloadable_images(parse_fragment).map { |_img, url| url }.uniq
      end

      # Rewrites <img src="..."> attributes by yielding each resolved URL
      # to the block and replacing it with the block's return value.
      # Returns the rewritten HTML string.
      # If the block returns nil/blank, the original src is preserved
      # (graceful fallback).
      def rewrite
        return html if html.blank?

        doc = parse_fragment

        downloadable_images(doc).each do |img, original_url|
          new_url = yield(original_url)
          img["src"] = new_url if new_url.present?
        end

        doc.to_html
      end

      private

      def parse_fragment
        Nokolexbor::DocumentFragment.parse(html)
      end

      # Collects [img_node, resolved_url] pairs for every <img src> in +doc+
      # whose source resolves to a downloadable URL, in document order.
      def downloadable_images(doc)
        pairs = []
        doc.css("img[src]").each do |img|
          url = resolve_url(img["src"])
          pairs << [ img, url ] if url && downloadable_url?(url)
        end
        pairs
      end

      # Turns a raw src value into an absolute URL string, or nil when it is
      # blank, a data: URI, relative with no usable base, or unparseable.
      def resolve_url(src)
        src = src.to_s.strip
        return nil if src.blank?
        return nil if src.start_with?("data:")

        uri = URI.parse(src)
        if uri.absolute?
          src
        elsif base_url.present?
          URI.join(base_url, src).to_s
        end
      rescue URI::Error
        # URI::Error covers InvalidURIError and also BadURIError, which
        # URI.join raises when base_url is itself relative.
        nil
      end

      # True only for http/https URLs (URI::HTTPS is a subclass of URI::HTTP).
      def downloadable_url?(url)
        URI.parse(url).is_a?(URI::HTTP)
      rescue URI::Error
        false
      end
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
require "securerandom"
|
|
5
|
+
|
|
6
|
+
module SourceMonitor
  module Images
    # Fetches a single image over HTTP and validates it against the images
    # settings (allowed content types, maximum size).
    class Downloader
      Result = Struct.new(:io, :filename, :content_type, :byte_size, keyword_init: true)

      attr_reader :url, :settings

      # @param url [String] image URL to fetch
      # @param settings [#download_timeout, #max_download_size, #allowed_content_types, nil]
      #   defaults to SourceMonitor.config.images
      def initialize(url, settings: nil)
        @url = url
        @settings = settings || SourceMonitor.config.images
      end

      # Downloads the image and returns a Result, or nil if download fails
      # or the image does not meet validation criteria (non-200 response,
      # disallowed/missing content type, empty body, body over the size limit).
      def call
        response = fetch_image
        return unless response

        # Drop parameters such as "; charset=..." and normalise case.
        content_type = response.headers["content-type"]&.split(";")&.first&.strip&.downcase
        return unless allowed_content_type?(content_type)

        body = response.body
        return unless body && body.bytesize > 0
        return if body.bytesize > settings.max_download_size

        Result.new(
          io: StringIO.new(body),
          filename: derive_filename(url, content_type),
          content_type: content_type,
          byte_size: body.bytesize
        )
      rescue Faraday::Error, URI::InvalidURIError, Timeout::Error
        nil
      end

      private

      # Performs the GET request; returns the response only for HTTP 200.
      def fetch_image
        connection = Faraday.new do |f|
          f.options.timeout = settings.download_timeout
          f.options.open_timeout = [ settings.download_timeout / 2, 5 ].min
          f.headers["User-Agent"] = user_agent_header
          f.headers["Accept"] = "image/*"
          f.adapter Faraday.default_adapter
        end

        response = connection.get(url)
        response if response.status == 200
      end

      # The configured HTTP user agent may be a callable; resolve it before
      # use and fall back to the gem default when it is blank.
      def user_agent_header
        configured = SourceMonitor.config.http.user_agent
        configured = configured.call if configured.respond_to?(:call)
        configured.presence || "SourceMonitor/#{SourceMonitor::VERSION}"
      end

      def allowed_content_type?(content_type)
        return false if content_type.blank?

        settings.allowed_content_types.include?(content_type)
      end

      # Uses the URL path's basename when it looks like a file name (contains
      # a dot); otherwise generates a random name with an extension derived
      # from the content type.
      def derive_filename(image_url, content_type)
        uri = URI.parse(image_url)
        basename = File.basename(uri.path) if uri.path.present?

        if basename.present? && basename.include?(".")
          basename
        else
          generated_filename(content_type)
        end
      rescue URI::InvalidURIError
        generated_filename(content_type)
      end

      # Random "image-<hex>" name; extension looked up from the content type,
      # ".bin" when the type is unknown.
      def generated_filename(content_type)
        ext = Rack::Mime::MIME_TYPES.invert[content_type] || ".bin"
        "image-#{SecureRandom.hex(8)}#{ext}"
      end
    end
  end
end
|