source_monitor 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/agent-memory/vbw-vbw-dev/MEMORY.md +34 -0
- data/.claude/agent-memory/vbw-vbw-lead/MEMORY.md +49 -0
- data/.claude/commands/release.md +255 -0
- data/.claude/skills/sm-configure/SKILL.md +13 -2
- data/.claude/skills/sm-configure/reference/configuration-reference.md +33 -0
- data/.claude/skills/sm-host-setup/SKILL.md +21 -3
- data/.claude/skills/sm-host-setup/reference/setup-checklist.md +36 -0
- data/.claude/skills/sm-job/SKILL.md +10 -9
- data/.gitignore +4 -0
- data/.vbw-planning/REQUIREMENTS.md +22 -0
- data/.vbw-planning/ROADMAP.md +125 -0
- data/.vbw-planning/STATE.md +43 -0
- data/.vbw-planning/config.json +3 -1
- data/.vbw-planning/discovery.json +3 -1
- data/.vbw-planning/phases/01-generator-steps/01-CONTEXT.md +33 -0
- data/.vbw-planning/phases/01-generator-steps/01-VERIFICATION.md +86 -0
- data/.vbw-planning/phases/01-generator-steps/PLAN-01-SUMMARY.md +61 -0
- data/.vbw-planning/phases/01-generator-steps/PLAN-01.md +380 -0
- data/.vbw-planning/phases/02-verification/02-VERIFICATION.md +78 -0
- data/.vbw-planning/phases/02-verification/PLAN-01-SUMMARY.md +46 -0
- data/.vbw-planning/phases/02-verification/PLAN-01.md +500 -0
- data/.vbw-planning/phases/03-docs-alignment/03-VERIFICATION.md +89 -0
- data/.vbw-planning/phases/03-docs-alignment/PLAN-01-SUMMARY.md +48 -0
- data/.vbw-planning/phases/03-docs-alignment/PLAN-01.md +456 -0
- data/.vbw-planning/phases/04-dashboard-ux/04-VERIFICATION.md +129 -0
- data/.vbw-planning/phases/04-dashboard-ux/PLAN-01-SUMMARY.md +70 -0
- data/.vbw-planning/phases/04-dashboard-ux/PLAN-01.md +747 -0
- data/.vbw-planning/phases/05-active-storage-images/05-VERIFICATION.md +156 -0
- data/.vbw-planning/phases/05-active-storage-images/PLAN-01-SUMMARY.md +69 -0
- data/.vbw-planning/phases/05-active-storage-images/PLAN-01.md +455 -0
- data/.vbw-planning/phases/05-active-storage-images/PLAN-02-SUMMARY.md +39 -0
- data/.vbw-planning/phases/05-active-storage-images/PLAN-02.md +488 -0
- data/.vbw-planning/phases/06-netflix-feed-fix/06-VERIFICATION.md +100 -0
- data/.vbw-planning/phases/06-netflix-feed-fix/PLAN-01-SUMMARY.md +37 -0
- data/.vbw-planning/phases/06-netflix-feed-fix/PLAN-01.md +345 -0
- data/CHANGELOG.md +43 -0
- data/Gemfile.lock +1 -1
- data/VERSION +1 -1
- data/app/assets/builds/source_monitor/application.css +9 -0
- data/app/helpers/source_monitor/application_helper.rb +38 -0
- data/app/jobs/source_monitor/download_content_images_job.rb +72 -0
- data/app/models/source_monitor/item_content.rb +2 -0
- data/app/views/source_monitor/dashboard/_recent_activity.html.erb +9 -0
- data/app/views/source_monitor/items/_details.html.erb +2 -2
- data/app/views/source_monitor/logs/index.html.erb +9 -0
- data/app/views/source_monitor/sources/_details.html.erb +2 -2
- data/app/views/source_monitor/sources/_row.html.erb +1 -1
- data/docs/setup.md +13 -4
- data/docs/troubleshooting.md +38 -7
- data/lib/generators/source_monitor/install/install_generator.rb +201 -0
- data/lib/source_monitor/configuration/http_settings.rb +7 -1
- data/lib/source_monitor/configuration/images_settings.rb +37 -0
- data/lib/source_monitor/configuration.rb +3 -1
- data/lib/source_monitor/dashboard/queries/recent_activity_query.rb +16 -7
- data/lib/source_monitor/dashboard/recent_activity.rb +1 -0
- data/lib/source_monitor/dashboard/recent_activity_presenter.rb +15 -2
- data/lib/source_monitor/fetching/feed_fetcher/entry_processor.rb +13 -0
- data/lib/source_monitor/http.rb +23 -0
- data/lib/source_monitor/images/content_rewriter.rb +81 -0
- data/lib/source_monitor/images/downloader.rb +82 -0
- data/lib/source_monitor/logs/table_presenter.rb +25 -0
- data/lib/source_monitor/setup/procfile_patcher.rb +31 -0
- data/lib/source_monitor/setup/queue_config_patcher.rb +84 -0
- data/lib/source_monitor/setup/verification/recurring_schedule_verifier.rb +102 -0
- data/lib/source_monitor/setup/verification/runner.rb +1 -1
- data/lib/source_monitor/setup/verification/solid_queue_verifier.rb +1 -1
- data/lib/source_monitor/setup/workflow.rb +10 -0
- data/lib/source_monitor/version.rb +1 -1
- data/lib/source_monitor.rb +8 -0
- metadata +34 -3
- data/.vbw-planning/.notification-log.jsonl +0 -294
- data/.vbw-planning/.session-log.jsonl +0 -1376
data/docs/setup.md
CHANGED
|
@@ -48,8 +48,13 @@ This ensures Bundler can load SourceMonitor so the commands below are available.
|
|
|
48
48
|
3. **Start background workers:**
|
|
49
49
|
```bash
|
|
50
50
|
bin/rails solid_queue:start
|
|
51
|
-
bin/jobs --recurring_schedule_file=config/recurring.yml # optional recurring scheduler
|
|
52
51
|
```
|
|
52
|
+
The install generator automatically handles all worker configuration:
|
|
53
|
+
- **Recurring jobs** are configured in `config/recurring.yml` (fetch scheduling, scraping, cleanup).
|
|
54
|
+
- **Procfile.dev** is patched with a `jobs:` entry so `bin/dev` starts Solid Queue alongside the web server.
|
|
55
|
+
- **Queue dispatcher** is patched with `recurring_schedule: config/recurring.yml` in `config/queue.yml` so recurring jobs load on startup.
|
|
56
|
+
|
|
57
|
+
All three steps are idempotent. If any configuration is missing, re-run: `bin/rails generate source_monitor:install`
|
|
53
58
|
|
|
54
59
|
4. **Visit the dashboard** at the chosen mount path, create a source, and trigger “Fetch Now” to validate realtime updates and Solid Queue processing.
|
|
55
60
|
|
|
@@ -83,10 +88,12 @@ Prefer to script each step or plug SourceMonitor into an existing deployment che
|
|
|
83
88
|
| --- | --- | --- |
|
|
84
89
|
| 1 | `gem "source_monitor", github: "dchuk/source_monitor"` | Add the engine to your Gemfile (skip if already present) |
|
|
85
90
|
| 2 | `bundle install` | Install Ruby dependencies |
|
|
86
|
-
| 3 | `bin/rails generate source_monitor:install --mount-path=/source_monitor` | Mount the engine
|
|
91
|
+
| 3 | `bin/rails generate source_monitor:install --mount-path=/source_monitor` | Mount the engine, create the initializer, and configure recurring jobs |
|
|
87
92
|
| 4 | `bin/rails railties:install:migrations FROM=source_monitor` | Copy engine migrations (idempotent) |
|
|
88
93
|
| 5 | `bin/rails db:migrate` | Apply schema updates, including Solid Queue tables |
|
|
89
94
|
| 6 | `bin/rails solid_queue:start` | Ensure jobs process via Solid Queue |
|
|
95
|
+
| 6a | Handled by generator (patches `Procfile.dev`) | Ensure `bin/dev` starts Solid Queue workers |
|
|
96
|
+
| 6b | Handled by generator (patches `config/queue.yml`) | Wire recurring jobs into Solid Queue dispatcher |
|
|
90
97
|
| 7 | `bin/jobs --recurring_schedule_file=config/recurring.yml` | Start recurring scheduler (optional but recommended) |
|
|
91
98
|
| 8 | `bin/source_monitor verify` | Confirm Solid Queue/Action Cable readiness and emit telemetry |
|
|
92
99
|
|
|
@@ -95,11 +102,13 @@ Prefer to script each step or plug SourceMonitor into an existing deployment che
|
|
|
95
102
|
### Step-by-step Details
|
|
96
103
|
|
|
97
104
|
1. **Add the gem** to the host `Gemfile` (GitHub edge or released version) and run `bundle install`. If your host manages node tooling, run `npm install` also.
|
|
98
|
-
2. **Install the engine** via `bin/rails generate source_monitor:install --mount-path=/source_monitor`. The generator mounts the engine, creates `config/initializers/source_monitor.rb`, and
|
|
105
|
+
2. **Install the engine** via `bin/rails generate source_monitor:install --mount-path=/source_monitor`. The generator mounts the engine, creates `config/initializers/source_monitor.rb`, and configures recurring Solid Queue jobs in `config/recurring.yml`. Re-running the generator is safe; it detects existing mounts/initializers and skips entries that are already present.
|
|
99
106
|
3. **Copy migrations** with `bin/rails railties:install:migrations FROM=source_monitor`. This brings in the SourceMonitor tables plus Solid Cable/Queue schema when needed. The command is idempotent—run it again after upgrading the gem.
|
|
100
107
|
4. **Apply database changes** using `bin/rails db:migrate`. If your host already installed Solid Queue migrations manually, delete duplicate files before migrating.
|
|
101
108
|
5. **Wire Action Cable** if necessary. SourceMonitor defaults to Solid Cable; confirm `ApplicationCable::Connection`/`Channel` exist and that `config/initializers/source_monitor.rb` uses the adapter you expect. To switch to Redis, set `config.realtime.adapter = :redis` and `config.realtime.redis_url`.
|
|
102
|
-
6. **Start workers** with `bin/rails solid_queue:start` (or your process manager).
|
|
109
|
+
6. **Start workers** with `bin/rails solid_queue:start` (or your process manager). The install generator automatically configures recurring jobs in `config/recurring.yml` for fetch scheduling, scraping, and cleanup. They'll run with `bin/dev` or `bin/jobs`.
|
|
110
|
+
- **Procfile.dev:** The generator automatically patches `Procfile.dev` with a `jobs:` entry for Solid Queue. Verify the file contains `jobs: bundle exec rake solid_queue:start` after running the generator.
|
|
111
|
+
- **Recurring schedule:** The generator automatically patches `config/queue.yml` dispatchers with `recurring_schedule: config/recurring.yml`. Verify the key is present after running the generator.
|
|
103
112
|
7. **Review the initializer** and tune queue names, HTTP timeouts, scraping adapters, retention limits, authentication hooks, and Mission Control integration. The [configuration reference](configuration.md) details every option.
|
|
104
113
|
8. **Verify the install**: run `bin/source_monitor verify` to ensure Solid Queue workers and Action Cable are healthy, then visit the mount path to trigger a fetch manually. Enable telemetry if you want JSON logs recorded for support.
|
|
105
114
|
|
data/docs/troubleshooting.md
CHANGED
|
@@ -20,45 +20,76 @@ This guide lists common issues you might encounter while installing, upgrading,
|
|
|
20
20
|
- Ensure at least one Solid Queue worker is running; the dashboard reads visibility data via `SourceMonitor::Jobs::Visibility`.
|
|
21
21
|
- When using mission control integration, keep `config.mission_control_dashboard_path` pointing at a valid route helper; otherwise the dashboard hides the link.
|
|
22
22
|
|
|
23
|
-
## 4. Realtime Updates Do Not Stream
|
|
23
|
+
## 4. Recurring Jobs Not Running
|
|
24
|
+
|
|
25
|
+
- **Symptoms:** Fetch scheduling, scrape scheduling, and cleanup jobs never fire. Sources never auto-fetch on their configured intervals.
|
|
26
|
+
- **Primary fix:** Re-run the install generator, which automatically patches the dispatcher config:
|
|
27
|
+
```bash
|
|
28
|
+
bin/rails generate source_monitor:install
|
|
29
|
+
```
|
|
30
|
+
- **Diagnostics:** Run `bin/source_monitor verify` to check recurring task registration. The RecurringScheduleVerifier will report whether SourceMonitor recurring tasks are loaded into Solid Queue.
|
|
31
|
+
- **Manual check:** Verify `config/queue.yml` includes `recurring_schedule: config/recurring.yml` under the `dispatchers:` section. Without this key, Solid Queue's dispatcher will not load the recurring schedule even though `config/recurring.yml` exists.
|
|
32
|
+
- **Manual fix (if generator cannot patch):**
|
|
33
|
+
```yaml
|
|
34
|
+
dispatchers:
|
|
35
|
+
- polling_interval: 1
|
|
36
|
+
batch_size: 500
|
|
37
|
+
recurring_schedule: config/recurring.yml
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## 5. Jobs Not Processing with bin/dev
|
|
41
|
+
|
|
42
|
+
- **Symptoms:** `bin/dev` starts the web server but jobs never run. Running `bin/rails solid_queue:start` manually works fine.
|
|
43
|
+
- **Primary fix:** Re-run the install generator, which automatically patches `Procfile.dev`:
|
|
44
|
+
```bash
|
|
45
|
+
bin/rails generate source_monitor:install
|
|
46
|
+
```
|
|
47
|
+
- **Diagnostics:** Run `bin/source_monitor verify` to check Solid Queue worker status. If no workers are detected, the SolidQueueVerifier will suggest checking `Procfile.dev`.
|
|
48
|
+
- **Manual check:** Verify `Procfile.dev` includes a `jobs:` line:
|
|
49
|
+
```
|
|
50
|
+
jobs: bundle exec rake solid_queue:start
|
|
51
|
+
```
|
|
52
|
+
- Most Rails 8 apps use foreman or overmind via `bin/dev`. Without a `jobs:` entry, the process manager only starts the web server and asset watchers—Solid Queue workers are not launched.
|
|
53
|
+
|
|
54
|
+
## 6. Realtime Updates Do Not Stream
|
|
24
55
|
|
|
25
56
|
- Confirm Action Cable is mounted and `ApplicationCable` classes exist (see installation guide).
|
|
26
57
|
- In production, verify WebSocket proxy settings allow the `/cable` endpoint.
|
|
27
58
|
- When switching to Redis, add `config.realtime.adapter = :redis` and `config.realtime.redis_url` in the initializer, then restart web and worker processes.
|
|
28
59
|
- For Solid Cable, check that the `solid_cable_messages` table exists and that no other process clears it unexpectedly.
|
|
29
60
|
|
|
30
|
-
## 5. Fetch Jobs Keep Failing
|
|
61
|
+
## 7. Fetch Jobs Keep Failing
|
|
31
62
|
|
|
32
63
|
- Review the most recent fetch log entry for the source; it stores the HTTP status, error class, and error message.
|
|
33
64
|
- Increase `config.http.timeout` or `config.http.retry_max` if the feed is slow or prone to transient errors.
|
|
34
65
|
- Supply custom headers or basic auth credentials via the source form when feeds require authentication.
|
|
35
66
|
- Check for TLS issues on self-signed feeds; you may need to configure Faraday with custom SSL options.
|
|
36
67
|
|
|
37
|
-
## 6. Scraping Returns "Failed"
|
|
68
|
+
## 8. Scraping Returns "Failed"
|
|
38
69
|
|
|
39
70
|
- Confirm the source has scraping enabled and the configured adapter exists.
|
|
40
71
|
- Override selectors in the source's scrape settings if the default Readability extraction misses key elements.
|
|
41
72
|
- Inspect the scrape log to see the adapter status and content length. Logs store the HTTP status and any exception raised by the adapter.
|
|
42
73
|
- Retry manually from the item detail page after fixing selectors.
|
|
43
74
|
|
|
44
|
-
## 7. Cleanup Rake Tasks Fail
|
|
75
|
+
## 9. Cleanup Rake Tasks Fail
|
|
45
76
|
|
|
46
77
|
- Pass numeric values for `FETCH_LOG_DAYS` or `SCRAPE_LOG_DAYS` environment variables (e.g., `FETCH_LOG_DAYS=30`).
|
|
47
78
|
- Ensure workers or the console environment have permission to soft delete (`SOFT_DELETE=true`) if you expect tombstones.
|
|
48
79
|
- If job classes cannot load, verify `SourceMonitor.configure` ran before calling `rake source_monitor:cleanup:*`.
|
|
49
80
|
|
|
50
|
-
## 8. Test Suite Cannot Launch a Browser
|
|
81
|
+
## 10. Test Suite Cannot Launch a Browser
|
|
51
82
|
|
|
52
83
|
- System tests rely on Selenium + Chrome. Install Chrome/Chromium and set `SELENIUM_CHROME_BINARY` if the binary lives in a non-standard path.
|
|
53
84
|
- You can run `rbenv exec bin/test-coverage --verbose` to inspect failures with additional logging.
|
|
54
85
|
|
|
55
|
-
## 9. Mission Control Jobs Link Returns 404
|
|
86
|
+
## 11. Mission Control Jobs Link Returns 404
|
|
56
87
|
|
|
57
88
|
- Mount `MissionControl::Jobs::Engine` in your host routes (for example, `mount MissionControl::Jobs::Engine, at: "/mission_control"`).
|
|
58
89
|
- Keep `config.mission_control_enabled = true` **and** `config.mission_control_dashboard_path` pointing at that mounted route helper. Call `SourceMonitor.mission_control_dashboard_path` in the Rails console to confirm it resolves.
|
|
59
90
|
- When hosting Mission Control in a separate app, provide a full URL instead of a route helper and ensure CORS/WebSocket settings allow the dashboard iframe.
|
|
60
91
|
|
|
61
|
-
## 10. Tailwind Build Fails or Admin UI Loads Without Styles
|
|
92
|
+
## 12. Tailwind Build Fails or Admin UI Loads Without Styles
|
|
62
93
|
|
|
63
94
|
- Running `test/dummy/bin/dev` before configuring the bundling pipeline will serve the admin UI without Tailwind styles or Stimulus behaviours. This happens because the engine no longer ships precompiled assets; see `.ai/engine-asset-configuration.md:11-44` for the required npm setup.
|
|
64
95
|
- Fix by running `npm install` followed by `npm run build` inside the engine root so that `app/assets/builds/source_monitor/application.css` and `application.js` exist. The Rake task `app:source_monitor:assets:build` wraps the same scripts for CI usage.
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "yaml"
|
|
3
4
|
require "rails/generators"
|
|
4
5
|
require "rails/generators/base"
|
|
5
6
|
|
|
@@ -32,7 +33,67 @@ module SourceMonitor
|
|
|
32
33
|
template "source_monitor.rb.tt", initializer_path
|
|
33
34
|
end
|
|
34
35
|
|
|
36
|
+
# Generator step: makes sure config/recurring.yml contains the SourceMonitor
# recurring job entries.
#
# - Entries already present  -> reports :skip and does nothing.
# - File exists without them -> merges the entries into the existing file.
# - File missing             -> creates a fresh file.
def configure_recurring_jobs
  relative = "config/recurring.yml"
  absolute = File.join(destination_root, relative)

  unless recurring_file_has_source_monitor_entries?(absolute)
    return File.exist?(absolute) ? merge_into_existing_recurring(absolute, relative) : create_recurring_file(absolute, relative)
  end

  say_status :skip, "#{relative} (SourceMonitor entries already present)", :yellow
  nil
end
|
|
51
|
+
|
|
52
|
+
# Generator step: guarantees Procfile.dev has a `jobs:` process.
#
# - No Procfile.dev            -> writes one with a web entry and the jobs entry.
# - A line starts with `jobs:` -> reports :skip and leaves the file alone.
# - Otherwise                  -> appends a blank line followed by the jobs entry.
def patch_procfile_dev
  target = File.join(destination_root, "Procfile.dev")

  unless File.exist?(target)
    File.write(target, "web: bin/rails server -p 3000\n#{PROCFILE_JOBS_ENTRY}\n")
    return say_status(:create, "Procfile.dev", :green)
  end

  if File.read(target).match?(/^jobs:/)
    say_status :skip, "Procfile.dev (jobs entry already present)", :yellow
    return
  end

  File.open(target, "a") { |io| io.puts("", PROCFILE_JOBS_ENTRY) }
  say_status :append, "Procfile.dev", :green
end
|
|
69
|
+
|
|
70
|
+
# Generator step: wires `recurring_schedule` into the dispatchers declared in
# config/queue.yml.
#
# Reports :skip (and changes nothing) when:
# - the file does not exist,
# - the file cannot be parsed as YAML or its root is not a mapping,
# - some dispatcher already declares `recurring_schedule`.
#
# Otherwise the parsed config is patched and written back with YAML.dump.
def configure_queue_dispatcher
  queue_path = File.join(destination_root, "config/queue.yml")

  unless File.exist?(queue_path)
    say_status :skip, "config/queue.yml (file not found — create it or run rails app:update to generate)", :yellow
    return
  end

  begin
    parsed = YAML.safe_load(File.read(queue_path), aliases: true) || {}
  rescue Psych::Exception => error
    # A file that is not plain YAML must not abort the whole generator run;
    # tell the user what to add by hand instead.
    say_status :skip,
      "config/queue.yml (could not be parsed: #{error.message} — add recurring_schedule: config/recurring.yml to a dispatcher manually)",
      :yellow
    return
  end

  # The helpers below walk the config with Hash methods, so any other root
  # (a list, a bare scalar) cannot be patched automatically.
  unless parsed.is_a?(Hash)
    say_status :skip,
      "config/queue.yml (unexpected structure — add recurring_schedule: config/recurring.yml to a dispatcher manually)",
      :yellow
    return
  end

  if queue_config_has_recurring_schedule?(parsed)
    say_status :skip, "config/queue.yml (recurring_schedule already configured)", :yellow
    return
  end

  add_recurring_schedule_to_dispatchers!(parsed)
  File.write(queue_path, YAML.dump(parsed))
  say_status :append, "config/queue.yml (added recurring_schedule to dispatchers)", :green
end
|
|
89
|
+
|
|
35
90
|
def print_next_steps
|
|
91
|
+
say_status :info,
|
|
92
|
+
"Procfile.dev configured — run bin/dev to start both web server and Solid Queue workers.",
|
|
93
|
+
:green
|
|
94
|
+
say_status :info,
|
|
95
|
+
"Recurring jobs configured in config/recurring.yml — they'll run automatically with bin/dev or bin/jobs.",
|
|
96
|
+
:green
|
|
36
97
|
say_status :info,
|
|
37
98
|
"Next steps: review docs/setup.md for the guided + manual install walkthrough and docs/troubleshooting.md for common fixes.",
|
|
38
99
|
:green
|
|
@@ -40,6 +101,88 @@ module SourceMonitor
|
|
|
40
101
|
|
|
41
102
|
private
|
|
42
103
|
|
|
104
|
+
# Line added to Procfile.dev so the process manager starts Solid Queue.
PROCFILE_JOBS_ENTRY = "jobs: bundle exec rake solid_queue:start"

# Recurring task definitions written into config/recurring.yml, keyed by task
# name. The nested hashes and arrays are frozen as well: these objects are
# merged by reference into parsed user configuration, so a shallow freeze
# would still let later code mutate the shared constant.
RECURRING_ENTRIES = {
  "source_monitor_schedule_fetches" => {
    "class" => "SourceMonitor::ScheduleFetchesJob",
    "args" => [ { "limit" => 100 }.freeze ].freeze,
    "schedule" => "every minute"
  }.freeze,
  "source_monitor_schedule_scrapes" => {
    "command" => "SourceMonitor::Scraping::Scheduler.run(limit: 100)",
    "schedule" => "every 2 minutes"
  }.freeze,
  "source_monitor_item_cleanup" => {
    "class" => "SourceMonitor::ItemCleanupJob",
    "schedule" => "at 2am every day"
  }.freeze,
  "source_monitor_log_cleanup" => {
    "class" => "SourceMonitor::LogCleanupJob",
    "args" => [ { "fetch_logs_older_than_days" => 90, "scrape_logs_older_than_days" => 60 }.freeze ].freeze,
    "schedule" => "at 3am every day"
  }.freeze
}.freeze
|
|
126
|
+
|
|
127
|
+
# True when the file at +path+ exists and its text mentions the
# "source_monitor_schedule_fetches" task name; false otherwise.
# This is a plain substring check on the raw file contents.
def recurring_file_has_source_monitor_entries?(path)
  File.exist?(path) && File.read(path).include?("source_monitor_schedule_fetches")
end
|
|
133
|
+
|
|
134
|
+
# Merges RECURRING_ENTRIES into an existing recurring.yml and rewrites it.
#
# destination    - absolute path of the file to read and rewrite.
# recurring_path - relative path used only for the status message.
#
# The entries go under "default" whenever the file is environment-structured
# (it has a "default" or a "development" key). write_recurring_yaml emits only
# the "default" entries in that layout, so merging them at the top level of a
# file that has "development" but no "default" would drop them from the
# output. Files with neither key get the entries merged at the top level.
def merge_into_existing_recurring(destination, recurring_path)
  parsed = YAML.safe_load(File.read(destination), aliases: true) || {}
  env_structured = parsed.key?("development")

  if parsed.key?("default") || env_structured
    # `|| {}` covers a "default:" key that is present but empty (nil).
    parsed["default"] = (parsed["default"] || {}).merge(RECURRING_ENTRIES)
  else
    parsed.merge!(RECURRING_ENTRIES)
  end

  write_recurring_yaml(destination, parsed, has_environments: env_structured)
  say_status :append, recurring_path, :green
end
|
|
147
|
+
|
|
148
|
+
# Writes a brand-new recurring.yml at +destination+, creating the parent
# directory first, then reports :create using +recurring_path+ as the label.
def create_recurring_file(destination, recurring_path)
  parent_dir = File.dirname(destination)
  FileUtils.mkdir_p(parent_dir)

  File.write(destination, build_fresh_recurring_yaml)
  say_status(:create, recurring_path, :green)
end
|
|
154
|
+
|
|
155
|
+
# Returns the text of a new recurring.yml: a `default` section (anchored as
# &default) holding RECURRING_ENTRIES, followed by development, test and
# production sections that each merge the default anchor.
def build_fresh_recurring_yaml
  default_body = format_entries_yaml(RECURRING_ENTRIES)

  [
    "default: &default\n#{default_body}\n",
    "development:\n <<: *default\n\n",
    "test:\n <<: *default\n\n",
    "production:\n <<: *default\n"
  ].join
end
|
|
163
|
+
|
|
164
|
+
# Serialises +parsed+ to +destination+.
#
# destination      - path of the file to write.
# parsed           - Hash loaded from the existing recurring.yml (merge keys
#                    already resolved by the YAML loader).
# has_environments - when false, the hash is written with YAML.dump as-is.
#                    When true, the file is rebuilt as a `default: &default`
#                    section plus one section per known environment.
#
# In the environment layout each section merges `*default` and then lists any
# entries that are specific to that environment (absent from, or different
# to, the default entries), so they are not lost when the file is rewritten.
# Only development, test and production sections are emitted.
#
# Returns the result of File.write.
def write_recurring_yaml(destination, parsed, has_environments: false)
  return File.write(destination, YAML.dump(parsed)) unless has_environments

  default_entries = parsed["default"] || {}
  envs = %w[development test production].select { |env| parsed.key?(env) }

  env_sections = envs.map do |env|
    # An empty `development:` key loads as nil, hence the Hash check.
    env_entries = parsed[env].is_a?(Hash) ? parsed[env] : {}
    overrides = env_entries.reject { |key, value| default_entries.key?(key) && default_entries[key] == value }
    next "#{env}:\n <<: *default" if overrides.empty?

    overrides_yaml = format_entries_yaml(overrides)
    # Sibling keys in a YAML mapping must share one indentation, so the merge
    # key reuses whatever indent format_entries_yaml applied to the entries.
    indent = overrides_yaml[/\A */]
    "#{env}:\n#{indent}<<: *default\n#{overrides_yaml.chomp}"
  end

  content = "default: &default\n#{format_entries_yaml(default_entries)}"
  content += "\n#{env_sections.join("\n\n")}\n" unless envs.empty?
  File.write(destination, content)
end
|
|
178
|
+
|
|
179
|
+
# Renders each entry of +entries+ (task name => definition) as its own YAML
# snippet, strips the leading document marker, indents every line, and joins
# the snippets with a newline (leaving a blank line between entries).
def format_entries_yaml(entries)
  snippets = entries.map do |name, definition|
    document = YAML.dump({ name => definition })
    without_marker = document.delete_prefix("---\n")
    without_marker.gsub(/^/, " ")
  end

  snippets.join("\n")
end
|
|
185
|
+
|
|
43
186
|
def engine_already_mounted?(mount_path)
|
|
44
187
|
routes_path = File.join(destination_root, "config/routes.rb")
|
|
45
188
|
return false unless File.exist?(routes_path)
|
|
@@ -54,6 +197,64 @@ module SourceMonitor
|
|
|
54
197
|
path = (raw_path && !raw_path.strip.empty?) ? raw_path.strip : "/source_monitor"
|
|
55
198
|
path.start_with?("/") ? path : "/#{path}"
|
|
56
199
|
end
|
|
200
|
+
|
|
201
|
+
# Path stored under a dispatcher's "recurring_schedule" key.
RECURRING_SCHEDULE_VALUE = "config/recurring.yml"

# Dispatcher definition used when a queue config declares no dispatchers.
DEFAULT_DISPATCHER = { "polling_interval" => 1, "batch_size" => 500, "recurring_schedule" => RECURRING_SCHEDULE_VALUE }.freeze
|
|
208
|
+
|
|
209
|
+
# Reports whether any dispatcher in the parsed queue config already declares
# a recurring schedule.
#
# parsed - the loaded queue.yml structure. Anything that is not a Hash
#          (including nested scalars and arrays) yields false.
#
# A "dispatchers" list is looked for at this level and, recursively, inside
# every nested Hash value, so both flat and environment-keyed configs are
# covered. String and symbol keys are accepted at every level.
def queue_config_has_recurring_schedule?(parsed)
  return false unless parsed.is_a?(Hash)

  dispatchers = parsed["dispatchers"] || parsed[:dispatchers]
  if dispatchers.is_a?(Array)
    configured = dispatchers.any? do |dispatcher|
      dispatcher.is_a?(Hash) && (dispatcher.key?("recurring_schedule") || dispatcher.key?(:recurring_schedule))
    end
    return true if configured
  end

  parsed.each_value.any? { |value| queue_config_has_recurring_schedule?(value) }
end
|
|
228
|
+
|
|
229
|
+
# Mutates +parsed+ so that every dispatcher Hash carries a
# "recurring_schedule" key (existing truthy values are kept).
#
# Dispatcher lists are looked up under the "dispatchers" key of each
# Hash-valued top-level section and of +parsed+ itself (flat config). When no
# such list exists anywhere, a top-level "dispatchers" list containing a copy
# of DEFAULT_DISPATCHER is added.
def add_recurring_schedule_to_dispatchers!(parsed)
  # Nested sections first, then the root, mirroring a flat config.
  sections = parsed.values.grep(Hash) << parsed
  dispatcher_lists = sections
    .select { |section| section.key?("dispatchers") && section["dispatchers"].is_a?(Array) }
    .map { |section| section["dispatchers"] }

  dispatcher_lists.each do |list|
    list.grep(Hash).each { |dispatcher| dispatcher["recurring_schedule"] ||= RECURRING_SCHEDULE_VALUE }
  end

  # No dispatchers found at all — add a default section
  parsed["dispatchers"] = [ DEFAULT_DISPATCHER.dup ] if dispatcher_lists.empty?
end
|
|
57
258
|
end
|
|
58
259
|
end
|
|
59
260
|
end
|
|
@@ -13,7 +13,10 @@ module SourceMonitor
|
|
|
13
13
|
:retry_interval,
|
|
14
14
|
:retry_interval_randomness,
|
|
15
15
|
:retry_backoff_factor,
|
|
16
|
-
:retry_statuses
|
|
16
|
+
:retry_statuses,
|
|
17
|
+
:ssl_ca_file,
|
|
18
|
+
:ssl_ca_path,
|
|
19
|
+
:ssl_verify
|
|
17
20
|
|
|
18
21
|
def initialize
|
|
19
22
|
reset!
|
|
@@ -31,6 +34,9 @@ module SourceMonitor
|
|
|
31
34
|
@retry_interval_randomness = 0.5
|
|
32
35
|
@retry_backoff_factor = 2
|
|
33
36
|
@retry_statuses = nil
|
|
37
|
+
@ssl_ca_file = nil
|
|
38
|
+
@ssl_ca_path = nil
|
|
39
|
+
@ssl_verify = true
|
|
34
40
|
end
|
|
35
41
|
|
|
36
42
|
private
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  class Configuration
    # Settings that control downloading of images referenced by item content.
    # Every attribute is a plain read/write accessor; #reset! restores defaults.
    class ImagesSettings
      DEFAULT_MAX_DOWNLOAD_SIZE = 10 * 1024 * 1024 # 10 MB
      DEFAULT_DOWNLOAD_TIMEOUT = 30 # seconds
      # NOTE(review): SVG files can embed scripts — confirm downloaded SVGs
      # are served safely before relying on this default.
      DEFAULT_ALLOWED_CONTENT_TYPES = %w[image/jpeg image/png image/gif image/webp image/svg+xml].freeze

      attr_accessor :download_to_active_storage
      attr_accessor :max_download_size
      attr_accessor :download_timeout
      attr_accessor :allowed_content_types

      def initialize
        reset!
      end

      # Restores every setting to its default. Downloading is off by default,
      # and the allowed content types are a fresh, mutable copy of the frozen
      # default list.
      def reset!
        self.download_to_active_storage = false
        self.max_download_size = DEFAULT_MAX_DOWNLOAD_SIZE
        self.download_timeout = DEFAULT_DOWNLOAD_TIMEOUT
        self.allowed_content_types = DEFAULT_ALLOWED_CONTENT_TYPES.dup
      end

      # Strict boolean view of +download_to_active_storage+.
      def download_enabled?
        download_to_active_storage ? true : false
      end
    end
  end
end
|
|
@@ -8,6 +8,7 @@ require "source_monitor/configuration/scraping_settings"
|
|
|
8
8
|
require "source_monitor/configuration/realtime_settings"
|
|
9
9
|
require "source_monitor/configuration/retention_settings"
|
|
10
10
|
require "source_monitor/configuration/authentication_settings"
|
|
11
|
+
require "source_monitor/configuration/images_settings"
|
|
11
12
|
require "source_monitor/configuration/scraper_registry"
|
|
12
13
|
require "source_monitor/configuration/events"
|
|
13
14
|
require "source_monitor/configuration/validation_definition"
|
|
@@ -26,7 +27,7 @@ module SourceMonitor
|
|
|
26
27
|
:mission_control_enabled,
|
|
27
28
|
:mission_control_dashboard_path
|
|
28
29
|
|
|
29
|
-
attr_reader :http, :scrapers, :retention, :events, :models, :realtime, :fetching, :health, :authentication, :scraping
|
|
30
|
+
attr_reader :http, :scrapers, :retention, :events, :models, :realtime, :fetching, :health, :authentication, :scraping, :images
|
|
30
31
|
|
|
31
32
|
DEFAULT_QUEUE_NAMESPACE = "source_monitor"
|
|
32
33
|
|
|
@@ -50,6 +51,7 @@ module SourceMonitor
|
|
|
50
51
|
@health = HealthSettings.new
|
|
51
52
|
@authentication = AuthenticationSettings.new
|
|
52
53
|
@scraping = ScrapingSettings.new
|
|
54
|
+
@images = ImagesSettings.new
|
|
53
55
|
end
|
|
54
56
|
|
|
55
57
|
def queue_name_for(role)
|
|
@@ -37,7 +37,8 @@ module SourceMonitor
|
|
|
37
37
|
item_title: row["item_title"],
|
|
38
38
|
item_url: row["item_url"],
|
|
39
39
|
source_name: row["source_name"],
|
|
40
|
-
source_id: row["source_id"]
|
|
40
|
+
source_id: row["source_id"],
|
|
41
|
+
source_feed_url: row["source_feed_url"]
|
|
41
42
|
)
|
|
42
43
|
end
|
|
43
44
|
|
|
@@ -57,7 +58,8 @@ module SourceMonitor
|
|
|
57
58
|
item_title,
|
|
58
59
|
item_url,
|
|
59
60
|
source_name,
|
|
60
|
-
source_id
|
|
61
|
+
source_id,
|
|
62
|
+
source_feed_url
|
|
61
63
|
FROM (
|
|
62
64
|
#{fetch_log_sql}
|
|
63
65
|
UNION ALL
|
|
@@ -83,9 +85,12 @@ module SourceMonitor
|
|
|
83
85
|
NULL AS scraper_adapter,
|
|
84
86
|
NULL AS item_title,
|
|
85
87
|
NULL AS item_url,
|
|
86
|
-
|
|
87
|
-
#{SourceMonitor::FetchLog.quoted_table_name}.source_id AS source_id
|
|
88
|
+
#{SourceMonitor::Source.quoted_table_name}.#{quoted_source_name} AS source_name,
|
|
89
|
+
#{SourceMonitor::FetchLog.quoted_table_name}.source_id AS source_id,
|
|
90
|
+
#{SourceMonitor::Source.quoted_table_name}.feed_url AS source_feed_url
|
|
88
91
|
FROM #{SourceMonitor::FetchLog.quoted_table_name}
|
|
92
|
+
LEFT JOIN #{SourceMonitor::Source.quoted_table_name}
|
|
93
|
+
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::FetchLog.quoted_table_name}.source_id
|
|
89
94
|
SQL
|
|
90
95
|
end
|
|
91
96
|
|
|
@@ -100,12 +105,15 @@ module SourceMonitor
|
|
|
100
105
|
NULL AS items_updated,
|
|
101
106
|
#{SourceMonitor::ScrapeLog.quoted_table_name}.scraper_adapter AS scraper_adapter,
|
|
102
107
|
NULL AS item_title,
|
|
103
|
-
|
|
108
|
+
#{SourceMonitor::Item.quoted_table_name}.url AS item_url,
|
|
104
109
|
#{SourceMonitor::Source.quoted_table_name}.#{quoted_source_name} AS source_name,
|
|
105
|
-
#{SourceMonitor::ScrapeLog.quoted_table_name}.source_id AS source_id
|
|
110
|
+
#{SourceMonitor::ScrapeLog.quoted_table_name}.source_id AS source_id,
|
|
111
|
+
NULL AS source_feed_url
|
|
106
112
|
FROM #{SourceMonitor::ScrapeLog.quoted_table_name}
|
|
107
113
|
LEFT JOIN #{SourceMonitor::Source.quoted_table_name}
|
|
108
114
|
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::ScrapeLog.quoted_table_name}.source_id
|
|
115
|
+
LEFT JOIN #{SourceMonitor::Item.quoted_table_name}
|
|
116
|
+
ON #{SourceMonitor::Item.quoted_table_name}.id = #{SourceMonitor::ScrapeLog.quoted_table_name}.item_id
|
|
109
117
|
SQL
|
|
110
118
|
end
|
|
111
119
|
|
|
@@ -122,7 +130,8 @@ module SourceMonitor
|
|
|
122
130
|
#{SourceMonitor::Item.quoted_table_name}.title AS item_title,
|
|
123
131
|
#{SourceMonitor::Item.quoted_table_name}.url AS item_url,
|
|
124
132
|
#{SourceMonitor::Source.quoted_table_name}.#{quoted_source_name} AS source_name,
|
|
125
|
-
#{SourceMonitor::Item.quoted_table_name}.source_id AS source_id
|
|
133
|
+
#{SourceMonitor::Item.quoted_table_name}.source_id AS source_id,
|
|
134
|
+
NULL AS source_feed_url
|
|
126
135
|
FROM #{SourceMonitor::Item.quoted_table_name}
|
|
127
136
|
LEFT JOIN #{SourceMonitor::Source.quoted_table_name}
|
|
128
137
|
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::Item.quoted_table_name}.source_id
|
|
@@ -30,13 +30,16 @@ module SourceMonitor
|
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
# Builds the view hash for a fetch-log activity event: label, created/updated
# counts, success/failure status, timestamp, the fetch log path, and the
# source feed URL (shown as its host, linked to the full URL).
def fetch_event(event)
  feed_url = event.source_feed_url
  host = source_domain(feed_url)
  counts = "#{event.items_created.to_i} created / #{event.items_updated.to_i} updated"
  outcome = event.success? ? :success : :failure

  {
    label: "Fetch ##{event.id}",
    description: counts,
    status: outcome,
    type: :fetch,
    time: event.occurred_at,
    path: url_helpers.fetch_log_path(event.id),
    url_display: host,
    url_href: feed_url
  }
end
|
|
42
45
|
|
|
@@ -47,10 +50,20 @@ module SourceMonitor
|
|
|
47
50
|
status: event.success? ? :success : :failure,
|
|
48
51
|
type: :scrape,
|
|
49
52
|
time: event.occurred_at,
|
|
50
|
-
path: url_helpers.scrape_log_path(event.id)
|
|
53
|
+
path: url_helpers.scrape_log_path(event.id),
|
|
54
|
+
url_display: event.item_url,
|
|
55
|
+
url_href: event.item_url
|
|
51
56
|
}
|
|
52
57
|
end
|
|
53
58
|
|
|
59
|
+
# Extracts the host from +feed_url+.
# Returns nil when the URL is blank or cannot be parsed as a URI.
def source_domain(feed_url)
  if feed_url.blank?
    nil
  else
    URI.parse(feed_url.to_s).host
  end
rescue URI::InvalidURIError
  nil
end
|
|
66
|
+
|
|
54
67
|
def item_event(event)
|
|
55
68
|
{
|
|
56
69
|
label: event.item_title.presence || "New Item",
|
|
@@ -38,6 +38,7 @@ module SourceMonitor
|
|
|
38
38
|
created += 1
|
|
39
39
|
created_items << result.item
|
|
40
40
|
SourceMonitor::Events.after_item_created(item: result.item, source:, entry:, result: result)
|
|
41
|
+
enqueue_image_download(result.item)
|
|
41
42
|
else
|
|
42
43
|
updated += 1
|
|
43
44
|
updated_items << result.item
|
|
@@ -61,6 +62,18 @@ module SourceMonitor
|
|
|
61
62
|
|
|
62
63
|
private
|
|
63
64
|
|
|
65
|
+
# Enqueues DownloadContentImagesJob for +item+ when image downloading is
# enabled in the configuration and the item has content.
#
# Any StandardError raised along the way is logged (when a Rails logger is
# available) and swallowed: a failure to enqueue must never break feed
# processing.
def enqueue_image_download(item)
  begin
    if SourceMonitor.config.images.download_enabled? && !item.content.blank?
      SourceMonitor::DownloadContentImagesJob.perform_later(item.id)
    end
  rescue StandardError => error
    logger_available = defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
    Rails.logger.error("[SourceMonitor] Failed to enqueue image download for item #{item.id}: #{error.message}") if logger_available
  end
end
|
|
76
|
+
|
|
64
77
|
def normalize_item_error(entry, error)
|
|
65
78
|
{
|
|
66
79
|
guid: safe_entry_guid(entry),
|
data/lib/source_monitor/http.rb
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "openssl"
|
|
3
4
|
require "faraday"
|
|
4
5
|
require "faraday/retry"
|
|
5
6
|
require "faraday/follow_redirects"
|
|
@@ -57,9 +58,31 @@ module SourceMonitor
|
|
|
57
58
|
connection.headers[key] = value
|
|
58
59
|
end
|
|
59
60
|
|
|
61
|
+
configure_ssl(connection, settings)
|
|
62
|
+
|
|
60
63
|
connection.adapter Faraday.default_adapter
|
|
61
64
|
end
|
|
62
65
|
|
|
66
|
+
# Applies the SSL settings to the connection.
#
# Verification stays on unless +ssl_verify+ is explicitly false. Trust roots
# come from, in order of precedence: +ssl_ca_file+, +ssl_ca_path+, or a
# default cert store. The default store exists because some systems otherwise
# fail to verify certificate chains that depend on intermediate CAs
# (e.g., Medium/Netflix on AWS); OpenSSL::X509::Store#set_default_paths loads
# all system-trusted CAs including intermediates.
def configure_ssl(connection, settings)
  ssl = connection.ssl
  ssl.verify = settings.ssl_verify != false

  if (ca_file = settings.ssl_ca_file)
    ssl.ca_file = ca_file
  elsif (ca_path = settings.ssl_ca_path)
    ssl.ca_path = ca_path
  else
    ssl.cert_store = default_cert_store
  end
end
|
|
81
|
+
|
|
82
|
+
# Builds a new X509 store with the system default certificate paths loaded.
def default_cert_store
  store = OpenSSL::X509::Store.new
  store.set_default_paths
  store
end
|
|
85
|
+
|
|
63
86
|
def default_headers(settings)
|
|
64
87
|
base_headers = {
|
|
65
88
|
"User-Agent" => resolve_callable(settings.user_agent).presence || DEFAULT_USER_AGENT,
|