source_monitor 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/.claude/skills/sm-architecture/SKILL.md +233 -0
  3. data/.claude/skills/sm-architecture/reference/extraction-patterns.md +192 -0
  4. data/.claude/skills/sm-architecture/reference/module-map.md +194 -0
  5. data/.claude/skills/sm-configuration-setting/SKILL.md +264 -0
  6. data/.claude/skills/sm-configuration-setting/reference/settings-catalog.md +248 -0
  7. data/.claude/skills/sm-configuration-setting/reference/settings-pattern.md +297 -0
  8. data/.claude/skills/sm-configure/SKILL.md +153 -0
  9. data/.claude/skills/sm-configure/reference/configuration-reference.md +321 -0
  10. data/.claude/skills/sm-dashboard-widget/SKILL.md +344 -0
  11. data/.claude/skills/sm-dashboard-widget/reference/dashboard-patterns.md +304 -0
  12. data/.claude/skills/sm-domain-model/SKILL.md +188 -0
  13. data/.claude/skills/sm-domain-model/reference/model-graph.md +114 -0
  14. data/.claude/skills/sm-domain-model/reference/table-structure.md +348 -0
  15. data/.claude/skills/sm-engine-migration/SKILL.md +395 -0
  16. data/.claude/skills/sm-engine-migration/reference/migration-conventions.md +255 -0
  17. data/.claude/skills/sm-engine-test/SKILL.md +302 -0
  18. data/.claude/skills/sm-engine-test/reference/test-helpers.md +259 -0
  19. data/.claude/skills/sm-engine-test/reference/test-patterns.md +411 -0
  20. data/.claude/skills/sm-event-handler/SKILL.md +265 -0
  21. data/.claude/skills/sm-event-handler/reference/events-api.md +229 -0
  22. data/.claude/skills/sm-health-rule/SKILL.md +327 -0
  23. data/.claude/skills/sm-health-rule/reference/health-system.md +269 -0
  24. data/.claude/skills/sm-host-setup/SKILL.md +223 -0
  25. data/.claude/skills/sm-host-setup/reference/initializer-template.md +195 -0
  26. data/.claude/skills/sm-host-setup/reference/setup-checklist.md +134 -0
  27. data/.claude/skills/sm-job/SKILL.md +263 -0
  28. data/.claude/skills/sm-job/reference/job-conventions.md +245 -0
  29. data/.claude/skills/sm-model-extension/SKILL.md +287 -0
  30. data/.claude/skills/sm-model-extension/reference/extension-api.md +317 -0
  31. data/.claude/skills/sm-pipeline-stage/SKILL.md +254 -0
  32. data/.claude/skills/sm-pipeline-stage/reference/completion-handlers.md +152 -0
  33. data/.claude/skills/sm-pipeline-stage/reference/entry-processing.md +191 -0
  34. data/.claude/skills/sm-pipeline-stage/reference/feed-fetcher-architecture.md +198 -0
  35. data/.claude/skills/sm-scraper-adapter/SKILL.md +284 -0
  36. data/.claude/skills/sm-scraper-adapter/reference/adapter-contract.md +167 -0
  37. data/.claude/skills/sm-scraper-adapter/reference/example-adapter.md +274 -0
  38. data/.vbw-planning/.notification-log.jsonl +102 -0
  39. data/.vbw-planning/.session-log.jsonl +505 -0
  40. data/AGENTS.md +20 -57
  41. data/CHANGELOG.md +19 -0
  42. data/CLAUDE.md +44 -1
  43. data/CONTRIBUTING.md +5 -5
  44. data/Gemfile.lock +20 -21
  45. data/README.md +18 -5
  46. data/VERSION +1 -0
  47. data/docs/deployment.md +1 -1
  48. data/docs/setup.md +4 -4
  49. data/lib/source_monitor/setup/skills_installer.rb +94 -0
  50. data/lib/source_monitor/setup/workflow.rb +17 -2
  51. data/lib/source_monitor/version.rb +1 -1
  52. data/lib/tasks/source_monitor_setup.rake +58 -0
  53. data/source_monitor.gemspec +1 -0
  54. metadata +39 -1
@@ -0,0 +1,274 @@
1
+ # Example Scraper Adapter
2
+
3
+ A complete working example of a custom scraper adapter.
4
+
5
+ ## Use Case
6
+
7
+ This adapter extracts content from pages that require API-based rendering (e.g., JavaScript-heavy sites that need a headless browser service).
8
+
9
+ ## Implementation
10
+
11
+ ```ruby
12
+ # app/scrapers/my_app/scrapers/headless.rb
13
+ module MyApp
14
+ module Scrapers
15
+ class Headless < SourceMonitor::Scrapers::Base
16
+ # Default settings for this adapter.
17
+ # Overridable per-source via source.scrape_settings JSON column,
18
+ # or per-invocation via the settings parameter.
19
+ def self.default_settings
20
+ {
21
+ render_service_url: ENV.fetch("RENDER_SERVICE_URL", "http://localhost:3001/render"),
22
+ wait_for_selector: "body",
23
+ timeout: 30,
24
+ selectors: {
25
+ content: "article, main, .content",
26
+ title: "h1, title"
27
+ }
28
+ }
29
+ end
30
+
31
+ def call
32
+ url = preferred_url
33
+ return missing_url_result unless url.present?
34
+
35
+ # Step 1: Render the page via headless service
36
+ render_result = render_page(url)
37
+ return fetch_failure(render_result) unless render_result[:success]
38
+
39
+ html = render_result[:body]
40
+
41
+ # Step 2: Extract content using CSS selectors
42
+ content = extract_content(html)
43
+ title = extract_title(html)
44
+
45
+ if content.blank?
46
+ return Result.new(
47
+ status: :partial,
48
+ html: html,
49
+ content: nil,
50
+ metadata: build_metadata(url: url, title: title, note: "No content extracted")
51
+ )
52
+ end
53
+
54
+ Result.new(
55
+ status: :success,
56
+ html: html,
57
+ content: content,
58
+ metadata: build_metadata(url: url, title: title)
59
+ )
60
+ rescue Faraday::TimeoutError => error
61
+ timeout_result(url, error)
62
+ rescue StandardError => error
63
+ error_result(url, error)
64
+ end
65
+
66
+ private
67
+
68
+ def preferred_url
69
+ item.canonical_url.presence || item.url
70
+ end
71
+
72
+ def render_page(url)
73
+ conn = http.client(timeout: settings[:timeout])
74
+ response = conn.post(settings[:render_service_url]) do |req|
75
+ req.headers["Content-Type"] = "application/json"
76
+ req.body = {
77
+ url: url,
78
+ wait_for: settings[:wait_for_selector],
79
+ timeout: (settings[:timeout].to_i * 1000)
80
+ }.to_json
81
+ end
82
+
83
+ if response.status >= 200 && response.status < 300
84
+ { success: true, body: response.body, status: response.status }
85
+ else
86
+ { success: false, status: response.status, error: "HTTP #{response.status}" }
87
+ end
88
+ rescue Faraday::Error => error
89
+ { success: false, error: error.message }
90
+ end
91
+
92
+ def extract_content(html)
93
+ return nil if html.blank?
94
+
95
+ doc = Nokogiri::HTML(html)
96
+ selector = settings.dig(:selectors, :content) || "body"
97
+
98
+ element = doc.at_css(selector)
99
+ return nil unless element
100
+
101
+ # Remove script/style tags and page chrome (nav, footer, header)
102
+ element.css("script, style, nav, footer, header").each(&:remove)
103
+ element.text.squeeze(" \n").strip
104
+ end
105
+
106
+ def extract_title(html)
107
+ return nil if html.blank?
108
+
109
+ doc = Nokogiri::HTML(html)
110
+ selector = settings.dig(:selectors, :title) || "title"
111
+ doc.at_css(selector)&.text&.strip
112
+ end
113
+
114
+ def build_metadata(url:, title: nil, note: nil)
115
+ meta = {
116
+ url: url,
117
+ extraction_method: "headless",
118
+ title: title
119
+ }
120
+ meta[:note] = note if note
121
+ meta.compact
122
+ end
123
+
124
+ def missing_url_result
125
+ Result.new(
126
+ status: :failed,
127
+ metadata: { error: "missing_url", message: "No URL available for scraping" }
128
+ )
129
+ end
130
+
131
+ def fetch_failure(render_result)
132
+ Result.new(
133
+ status: :failed,
134
+ metadata: {
135
+ error: "render_failed",
136
+ message: render_result[:error] || "Render service returned error",
137
+ http_status: render_result[:status]
138
+ }.compact
139
+ )
140
+ end
141
+
142
+ def timeout_result(url, error)
143
+ Result.new(
144
+ status: :failed,
145
+ metadata: {
146
+ error: "timeout",
147
+ message: error.message,
148
+ url: url
149
+ }
150
+ )
151
+ end
152
+
153
+ def error_result(url, error)
154
+ Result.new(
155
+ status: :failed,
156
+ metadata: {
157
+ error: error.class.name,
158
+ message: error.message,
159
+ url: url
160
+ }
161
+ )
162
+ end
163
+ end
164
+ end
165
+ end
166
+ ```
167
+
168
+ ## Registration
169
+
170
+ ```ruby
171
+ # config/initializers/source_monitor.rb
172
+ SourceMonitor.configure do |config|
173
+ config.scrapers.register(:headless, "MyApp::Scrapers::Headless")
174
+ end
175
+ ```
176
+
177
+ ## Per-Source Settings
178
+
179
+ Override adapter defaults via the source's `scrape_settings` JSON column:
180
+
181
+ ```ruby
182
+ source = SourceMonitor::Source.find(1)
183
+ source.update!(scrape_settings: {
184
+ render_service_url: "https://render.example.com/api/render",
185
+ wait_for_selector: ".article-content",
186
+ timeout: 60,
187
+ selectors: {
188
+ content: ".article-body",
189
+ title: ".article-title h1"
190
+ }
191
+ })
192
+ ```
193
+
194
+ ## Tests
195
+
196
+ ```ruby
197
+ require "test_helper"
198
+ require "webmock/minitest"
199
+
200
+ class HeadlessScraperTest < ActiveSupport::TestCase
201
+ setup do
202
+ @source = create_source!
203
+ @item = @source.items.create!(
204
+ title: "Test Article",
205
+ url: "https://example.com/spa-article",
206
+ external_id: "headless-test-1"
207
+ )
208
+ end
209
+
210
+ test "successfully renders and extracts content" do
211
+ stub_request(:post, "http://localhost:3001/render")
212
+ .to_return(
213
+ status: 200,
214
+ body: <<~HTML
215
+ <html>
216
+ <head><title>Test Page</title></head>
217
+ <body>
218
+ <article>
219
+ <h1>Article Title</h1>
220
+ <p>This is the article content.</p>
221
+ </article>
222
+ </body>
223
+ </html>
224
+ HTML
225
+ )
226
+
227
+ result = MyApp::Scrapers::Headless.call(item: @item, source: @source)
228
+
229
+ assert_equal :success, result.status
230
+ assert_includes result.content, "article content"
231
+ assert_equal "headless", result.metadata[:extraction_method]
232
+ end
233
+
234
+ test "returns failed when render service is down" do
235
+ stub_request(:post, "http://localhost:3001/render")
236
+ .to_return(status: 500, body: "Internal Server Error")
237
+
238
+ result = MyApp::Scrapers::Headless.call(item: @item, source: @source)
239
+
240
+ assert_equal :failed, result.status
241
+ assert_equal "render_failed", result.metadata[:error]
242
+ end
243
+
244
+ test "returns partial when no content found" do
245
+ stub_request(:post, "http://localhost:3001/render")
246
+ .to_return(status: 200, body: "<html><body><nav>Nav only</nav></body></html>")
247
+
248
+ result = MyApp::Scrapers::Headless.call(item: @item, source: @source)
249
+
250
+ assert_equal :partial, result.status
251
+ assert_nil result.content
252
+ end
253
+
254
+ test "handles missing URL" do
255
+ @item.update!(url: nil)
256
+
257
+ result = MyApp::Scrapers::Headless.call(item: @item, source: @source)
258
+
259
+ assert_equal :failed, result.status
260
+ assert_equal "missing_url", result.metadata[:error]
261
+ end
262
+
263
+ test "merges source-level settings" do
264
+ @source.update!(scrape_settings: { timeout: 60 })
265
+
266
+ stub_request(:post, "http://localhost:3001/render")
267
+ .to_return(status: 200, body: "<html><body><article>Content</article></body></html>")
268
+
269
+ result = MyApp::Scrapers::Headless.call(item: @item, source: @source)
270
+
271
+ assert_equal :success, result.status
272
+ end
273
+ end
274
+ ```
@@ -190,3 +190,105 @@
190
190
  "title": "",
191
191
  "message": "Claude Code needs your attention"
192
192
  }
193
+ {
194
+ "timestamp": "2026-02-11T00:46:04Z",
195
+ "type": "idle_prompt",
196
+ "title": "",
197
+ "message": "Claude is waiting for your input"
198
+ }
199
+ {
200
+ "timestamp": "2026-02-11T00:59:14Z",
201
+ "type": "idle_prompt",
202
+ "title": "",
203
+ "message": "Claude is waiting for your input"
204
+ }
205
+ {
206
+ "timestamp": "2026-02-11T03:26:32Z",
207
+ "type": "idle_prompt",
208
+ "title": "",
209
+ "message": "Claude is waiting for your input"
210
+ }
211
+ {
212
+ "timestamp": "2026-02-11T03:46:35Z",
213
+ "type": "idle_prompt",
214
+ "title": "",
215
+ "message": "Claude is waiting for your input"
216
+ }
217
+ {
218
+ "timestamp": "2026-02-11T04:21:35Z",
219
+ "type": "permission_prompt",
220
+ "title": "",
221
+ "message": "Claude Code needs your attention"
222
+ }
223
+ {
224
+ "timestamp": "2026-02-11T04:28:53Z",
225
+ "type": "permission_prompt",
226
+ "title": "",
227
+ "message": "Claude Code needs your approval for the plan"
228
+ }
229
+ {
230
+ "timestamp": "2026-02-11T04:33:09Z",
231
+ "type": "permission_prompt",
232
+ "title": "",
233
+ "message": "Claude Code needs your approval for the plan"
234
+ }
235
+ {
236
+ "timestamp": "2026-02-11T04:46:56Z",
237
+ "type": "idle_prompt",
238
+ "title": "",
239
+ "message": "Claude is waiting for your input"
240
+ }
241
+ {
242
+ "timestamp": "2026-02-11T04:55:02Z",
243
+ "type": "permission_prompt",
244
+ "title": "",
245
+ "message": "Claude Code needs your approval for the plan"
246
+ }
247
+ {
248
+ "timestamp": "2026-02-11T05:06:47Z",
249
+ "type": "idle_prompt",
250
+ "title": "",
251
+ "message": "Claude is waiting for your input"
252
+ }
253
+ {
254
+ "timestamp": "2026-02-11T05:08:54Z",
255
+ "type": "idle_prompt",
256
+ "title": "",
257
+ "message": "Claude is waiting for your input"
258
+ }
259
+ {
260
+ "timestamp": "2026-02-11T05:11:34Z",
261
+ "type": "idle_prompt",
262
+ "title": "",
263
+ "message": "Claude is waiting for your input"
264
+ }
265
+ {
266
+ "timestamp": "2026-02-11T05:14:10Z",
267
+ "type": "idle_prompt",
268
+ "title": "",
269
+ "message": "Claude is waiting for your input"
270
+ }
271
+ {
272
+ "timestamp": "2026-02-11T05:15:43Z",
273
+ "type": "idle_prompt",
274
+ "title": "",
275
+ "message": "Claude is waiting for your input"
276
+ }
277
+ {
278
+ "timestamp": "2026-02-11T05:20:49Z",
279
+ "type": "idle_prompt",
280
+ "title": "",
281
+ "message": "Claude is waiting for your input"
282
+ }
283
+ {
284
+ "timestamp": "2026-02-11T05:24:55Z",
285
+ "type": "idle_prompt",
286
+ "title": "",
287
+ "message": "Claude is waiting for your input"
288
+ }
289
+ {
290
+ "timestamp": "2026-02-11T05:35:24Z",
291
+ "type": "idle_prompt",
292
+ "title": "",
293
+ "message": "Claude is waiting for your input"
294
+ }