source_monitor 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/skills/sm-architecture/SKILL.md +233 -0
- data/.claude/skills/sm-architecture/reference/extraction-patterns.md +192 -0
- data/.claude/skills/sm-architecture/reference/module-map.md +194 -0
- data/.claude/skills/sm-configuration-setting/SKILL.md +264 -0
- data/.claude/skills/sm-configuration-setting/reference/settings-catalog.md +248 -0
- data/.claude/skills/sm-configuration-setting/reference/settings-pattern.md +297 -0
- data/.claude/skills/sm-configure/SKILL.md +153 -0
- data/.claude/skills/sm-configure/reference/configuration-reference.md +321 -0
- data/.claude/skills/sm-dashboard-widget/SKILL.md +344 -0
- data/.claude/skills/sm-dashboard-widget/reference/dashboard-patterns.md +304 -0
- data/.claude/skills/sm-domain-model/SKILL.md +188 -0
- data/.claude/skills/sm-domain-model/reference/model-graph.md +114 -0
- data/.claude/skills/sm-domain-model/reference/table-structure.md +348 -0
- data/.claude/skills/sm-engine-migration/SKILL.md +395 -0
- data/.claude/skills/sm-engine-migration/reference/migration-conventions.md +255 -0
- data/.claude/skills/sm-engine-test/SKILL.md +302 -0
- data/.claude/skills/sm-engine-test/reference/test-helpers.md +259 -0
- data/.claude/skills/sm-engine-test/reference/test-patterns.md +411 -0
- data/.claude/skills/sm-event-handler/SKILL.md +265 -0
- data/.claude/skills/sm-event-handler/reference/events-api.md +229 -0
- data/.claude/skills/sm-health-rule/SKILL.md +327 -0
- data/.claude/skills/sm-health-rule/reference/health-system.md +269 -0
- data/.claude/skills/sm-host-setup/SKILL.md +223 -0
- data/.claude/skills/sm-host-setup/reference/initializer-template.md +195 -0
- data/.claude/skills/sm-host-setup/reference/setup-checklist.md +134 -0
- data/.claude/skills/sm-job/SKILL.md +263 -0
- data/.claude/skills/sm-job/reference/job-conventions.md +245 -0
- data/.claude/skills/sm-model-extension/SKILL.md +287 -0
- data/.claude/skills/sm-model-extension/reference/extension-api.md +317 -0
- data/.claude/skills/sm-pipeline-stage/SKILL.md +254 -0
- data/.claude/skills/sm-pipeline-stage/reference/completion-handlers.md +152 -0
- data/.claude/skills/sm-pipeline-stage/reference/entry-processing.md +191 -0
- data/.claude/skills/sm-pipeline-stage/reference/feed-fetcher-architecture.md +198 -0
- data/.claude/skills/sm-scraper-adapter/SKILL.md +284 -0
- data/.claude/skills/sm-scraper-adapter/reference/adapter-contract.md +167 -0
- data/.claude/skills/sm-scraper-adapter/reference/example-adapter.md +274 -0
- data/.vbw-planning/.notification-log.jsonl +102 -0
- data/.vbw-planning/.session-log.jsonl +505 -0
- data/AGENTS.md +20 -57
- data/CHANGELOG.md +19 -0
- data/CLAUDE.md +44 -1
- data/CONTRIBUTING.md +5 -5
- data/Gemfile.lock +20 -21
- data/README.md +18 -5
- data/VERSION +1 -0
- data/docs/deployment.md +1 -1
- data/docs/setup.md +4 -4
- data/lib/source_monitor/setup/skills_installer.rb +94 -0
- data/lib/source_monitor/setup/workflow.rb +17 -2
- data/lib/source_monitor/version.rb +1 -1
- data/lib/tasks/source_monitor_setup.rake +58 -0
- data/source_monitor.gemspec +1 -0
- metadata +39 -1
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
# Example Scraper Adapter
|
|
2
|
+
|
|
3
|
+
A complete working example of a custom scraper adapter.
|
|
4
|
+
|
|
5
|
+
## Use Case
|
|
6
|
+
|
|
7
|
+
This adapter extracts content from pages that require API-based rendering (e.g., JavaScript-heavy sites that need a headless browser service).
|
|
8
|
+
|
|
9
|
+
## Implementation
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
# app/scrapers/my_app/scrapers/headless.rb
|
|
13
|
+
module MyApp
  module Scrapers
    # Example scraper adapter that renders a page through an external
    # headless-browser HTTP service and then extracts text with CSS selectors.
    #
    # NOTE(review): `item`, `settings`, `http` and `Result` are not defined in
    # this class; they are presumably provided by SourceMonitor::Scrapers::Base.
    # Confirm against the adapter contract.
    class Headless < SourceMonitor::Scrapers::Base
      # Default settings for this adapter.
      # Overridable per-source via source.scrape_settings JSON column,
      # or per-invocation via the settings parameter.
      #
      # @return [Hash] render service URL, selector to wait for, timeout
      #   (used as seconds for the HTTP client and converted to milliseconds
      #   for the render request), and the content/title CSS selectors
      def self.default_settings
        {
          render_service_url: ENV.fetch("RENDER_SERVICE_URL", "http://localhost:3001/render"),
          wait_for_selector: "body",
          timeout: 30,
          selectors: {
            content: "article, main, .content",
            title: "h1, title"
          }
        }
      end

      # Renders the item's URL and extracts its content.
      #
      # @return [Result] :success when content was extracted, :partial when the
      #   page rendered but no content matched, :failed otherwise (missing URL,
      #   render failure, timeout, or any other StandardError)
      def call
        url = preferred_url
        return missing_url_result if url.blank?

        # Step 1: Render the page via headless service
        render_result = render_page(url)
        return fetch_failure(render_result) unless render_result[:success]

        html = render_result[:body]

        # Step 2: Extract content using CSS selectors
        content = extract_content(html)
        title = extract_title(html)

        if content.blank?
          return Result.new(
            status: :partial,
            html: html,
            content: nil,
            metadata: build_metadata(url: url, title: title, note: "No content extracted")
          )
        end

        Result.new(
          status: :success,
          html: html,
          content: content,
          metadata: build_metadata(url: url, title: title)
        )
      rescue Faraday::TimeoutError => error
        # Reached because render_page re-raises timeouts instead of folding
        # them into a generic render failure.
        timeout_result(url, error)
      rescue StandardError => error
        error_result(url, error)
      end

      private

      # @return [String, nil] the item's canonical URL when present, else its URL
      def preferred_url
        item.canonical_url.presence || item.url
      end

      # POSTs a render request to the configured service.
      #
      # @param url [String] page to render
      # @return [Hash] `{ success: true, body:, status: }` for a 2xx response,
      #   otherwise `{ success: false, error:, status: }` (status is absent
      #   when the request itself failed)
      # @raise [Faraday::TimeoutError] timeouts propagate so that #call can
      #   report them as "timeout" rather than "render_failed"
      def render_page(url)
        conn = http.client(timeout: settings[:timeout])
        response = conn.post(settings[:render_service_url]) do |req|
          req.headers["Content-Type"] = "application/json"
          req.body = {
            url: url,
            wait_for: settings[:wait_for_selector],
            # The settings value is multiplied by 1000 here, i.e. the render
            # request carries the timeout in milliseconds.
            timeout: (settings[:timeout].to_i * 1000)
          }.to_json
        end

        if response.status.between?(200, 299)
          { success: true, body: response.body, status: response.status }
        else
          { success: false, status: response.status, error: "HTTP #{response.status}" }
        end
      rescue Faraday::TimeoutError
        # Faraday::TimeoutError is a Faraday::Error; without this clause the
        # generic rescue below would swallow it and the timeout handling in
        # #call would be unreachable.
        raise
      rescue Faraday::Error => error
        { success: false, error: error.message }
      end

      # Extracts the text of the first element matching the content selector.
      #
      # @param html [String, nil] rendered page markup
      # @return [String, nil] extracted text, or nil when the markup is blank
      #   or nothing matches the selector
      def extract_content(html)
        return nil if html.blank?

        doc = Nokogiri::HTML(html)
        selector = settings.dig(:selectors, :content) || "body"

        element = doc.at_css(selector)
        return nil unless element

        # Strip non-content elements: scripts, styles, and nav/footer/header
        element.css("script, style, nav, footer, header").each(&:remove)
        element.text.squeeze(" \n").strip
      end

      # Extracts the text of the first element matching the title selector.
      #
      # @param html [String, nil] rendered page markup
      # @return [String, nil] stripped title text, or nil when nothing matches
      def extract_title(html)
        return nil if html.blank?

        doc = Nokogiri::HTML(html)
        selector = settings.dig(:selectors, :title) || "title"
        doc.at_css(selector)&.text&.strip
      end

      # Builds the metadata hash attached to :success and :partial results.
      # Nil values (e.g. a missing title) are dropped.
      #
      # @param url [String] the URL that was rendered
      # @param title [String, nil] extracted title
      # @param note [String, nil] optional explanatory note
      # @return [Hash]
      def build_metadata(url:, title: nil, note: nil)
        meta = {
          url: url,
          extraction_method: "headless",
          title: title
        }
        meta[:note] = note if note
        meta.compact
      end

      # @return [Result] failed result used when the item has no URL to scrape
      def missing_url_result
        Result.new(
          status: :failed,
          metadata: { error: "missing_url", message: "No URL available for scraping" }
        )
      end

      # @param render_result [Hash] unsuccessful hash returned by #render_page
      # @return [Result] failed result carrying the render error and, when
      #   available, the HTTP status
      def fetch_failure(render_result)
        Result.new(
          status: :failed,
          metadata: {
            error: "render_failed",
            message: render_result[:error] || "Render service returned error",
            http_status: render_result[:status]
          }.compact
        )
      end

      # @param url [String, nil] URL being rendered when the timeout occurred
      # @param error [Faraday::TimeoutError]
      # @return [Result] failed result tagged with error "timeout"
      def timeout_result(url, error)
        Result.new(
          status: :failed,
          metadata: {
            error: "timeout",
            message: error.message,
            url: url
          }
        )
      end

      # @param url [String, nil] URL being processed when the error occurred
      # @param error [StandardError]
      # @return [Result] failed result tagged with the error's class name
      def error_result(url, error)
        Result.new(
          status: :failed,
          metadata: {
            error: error.class.name,
            message: error.message,
            url: url
          }
        )
      end
    end
  end
end
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## Registration
|
|
169
|
+
|
|
170
|
+
```ruby
|
|
171
|
+
# config/initializers/source_monitor.rb
|
|
172
|
+
# Make the adapter available under the :headless key.
SourceMonitor.configure { |config| config.scrapers.register(:headless, "MyApp::Scrapers::Headless") }
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Per-Source Settings
|
|
178
|
+
|
|
179
|
+
Override adapter defaults via the source's `scrape_settings` JSON column:
|
|
180
|
+
|
|
181
|
+
```ruby
|
|
182
|
+
# Values set here take the place of the adapter's default_settings entries
# for this one source.
headless_overrides = {
  render_service_url: "https://render.example.com/api/render",
  wait_for_selector: ".article-content",
  timeout: 60,
  selectors: {
    content: ".article-body",
    title: ".article-title h1"
  }
}

source = SourceMonitor::Source.find(1)
source.update!(scrape_settings: headless_overrides)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Tests
|
|
195
|
+
|
|
196
|
+
```ruby
|
|
197
|
+
require "test_helper"
|
|
198
|
+
require "webmock/minitest"
|
|
199
|
+
|
|
200
|
+
# Exercises MyApp::Scrapers::Headless against a WebMock-stubbed render service.
class HeadlessScraperTest < ActiveSupport::TestCase
  # Matches the adapter's default render_service_url (used when the
  # RENDER_SERVICE_URL environment variable is not set).
  RENDER_URL = "http://localhost:3001/render"

  setup do
    @source = create_source!
    @item = @source.items.create!(
      title: "Test Article",
      url: "https://example.com/spa-article",
      external_id: "headless-test-1"
    )
  end

  test "successfully renders and extracts content" do
    stub_request(:post, RENDER_URL)
      .to_return(
        status: 200,
        body: <<~HTML
          <html>
            <head><title>Test Page</title></head>
            <body>
              <article>
                <h1>Article Title</h1>
                <p>This is the article content.</p>
              </article>
            </body>
          </html>
        HTML
      )

    result = MyApp::Scrapers::Headless.call(item: @item, source: @source)

    assert_equal :success, result.status
    assert_includes result.content, "article content"
    assert_equal "headless", result.metadata[:extraction_method]
  end

  test "returns failed when render service is down" do
    stub_request(:post, RENDER_URL)
      .to_return(status: 500, body: "Internal Server Error")

    result = MyApp::Scrapers::Headless.call(item: @item, source: @source)

    assert_equal :failed, result.status
    assert_equal "render_failed", result.metadata[:error]
  end

  test "returns partial when no content found" do
    # Nothing in this page matches the default content selector.
    stub_request(:post, RENDER_URL)
      .to_return(status: 200, body: "<html><body><nav>Nav only</nav></body></html>")

    result = MyApp::Scrapers::Headless.call(item: @item, source: @source)

    assert_equal :partial, result.status
    assert_nil result.content
  end

  test "handles missing URL" do
    @item.update!(url: nil)

    result = MyApp::Scrapers::Headless.call(item: @item, source: @source)

    assert_equal :failed, result.status
    assert_equal "missing_url", result.metadata[:error]
  end

  test "merges source-level settings" do
    @source.update!(scrape_settings: { timeout: 60 })

    stub_request(:post, RENDER_URL)
      .to_return(status: 200, body: "<html><body><article>Content</article></body></html>")

    result = MyApp::Scrapers::Headless.call(item: @item, source: @source)

    assert_equal :success, result.status
    # A bare :success would also pass if the override were ignored. The adapter
    # sends the timeout multiplied by 1000, so the 60 set above must reach the
    # render service as 60_000.
    assert_requested(:post, RENDER_URL) do |request|
      JSON.parse(request.body)["timeout"] == 60_000
    end
  end
end
|
|
274
|
+
```
|
|
@@ -190,3 +190,105 @@
|
|
|
190
190
|
"title": "",
|
|
191
191
|
"message": "Claude Code needs your attention"
|
|
192
192
|
}
|
|
193
|
+
{
|
|
194
|
+
"timestamp": "2026-02-11T00:46:04Z",
|
|
195
|
+
"type": "idle_prompt",
|
|
196
|
+
"title": "",
|
|
197
|
+
"message": "Claude is waiting for your input"
|
|
198
|
+
}
|
|
199
|
+
{
|
|
200
|
+
"timestamp": "2026-02-11T00:59:14Z",
|
|
201
|
+
"type": "idle_prompt",
|
|
202
|
+
"title": "",
|
|
203
|
+
"message": "Claude is waiting for your input"
|
|
204
|
+
}
|
|
205
|
+
{
|
|
206
|
+
"timestamp": "2026-02-11T03:26:32Z",
|
|
207
|
+
"type": "idle_prompt",
|
|
208
|
+
"title": "",
|
|
209
|
+
"message": "Claude is waiting for your input"
|
|
210
|
+
}
|
|
211
|
+
{
|
|
212
|
+
"timestamp": "2026-02-11T03:46:35Z",
|
|
213
|
+
"type": "idle_prompt",
|
|
214
|
+
"title": "",
|
|
215
|
+
"message": "Claude is waiting for your input"
|
|
216
|
+
}
|
|
217
|
+
{
|
|
218
|
+
"timestamp": "2026-02-11T04:21:35Z",
|
|
219
|
+
"type": "permission_prompt",
|
|
220
|
+
"title": "",
|
|
221
|
+
"message": "Claude Code needs your attention"
|
|
222
|
+
}
|
|
223
|
+
{
|
|
224
|
+
"timestamp": "2026-02-11T04:28:53Z",
|
|
225
|
+
"type": "permission_prompt",
|
|
226
|
+
"title": "",
|
|
227
|
+
"message": "Claude Code needs your approval for the plan"
|
|
228
|
+
}
|
|
229
|
+
{
|
|
230
|
+
"timestamp": "2026-02-11T04:33:09Z",
|
|
231
|
+
"type": "permission_prompt",
|
|
232
|
+
"title": "",
|
|
233
|
+
"message": "Claude Code needs your approval for the plan"
|
|
234
|
+
}
|
|
235
|
+
{
|
|
236
|
+
"timestamp": "2026-02-11T04:46:56Z",
|
|
237
|
+
"type": "idle_prompt",
|
|
238
|
+
"title": "",
|
|
239
|
+
"message": "Claude is waiting for your input"
|
|
240
|
+
}
|
|
241
|
+
{
|
|
242
|
+
"timestamp": "2026-02-11T04:55:02Z",
|
|
243
|
+
"type": "permission_prompt",
|
|
244
|
+
"title": "",
|
|
245
|
+
"message": "Claude Code needs your approval for the plan"
|
|
246
|
+
}
|
|
247
|
+
{
|
|
248
|
+
"timestamp": "2026-02-11T05:06:47Z",
|
|
249
|
+
"type": "idle_prompt",
|
|
250
|
+
"title": "",
|
|
251
|
+
"message": "Claude is waiting for your input"
|
|
252
|
+
}
|
|
253
|
+
{
|
|
254
|
+
"timestamp": "2026-02-11T05:08:54Z",
|
|
255
|
+
"type": "idle_prompt",
|
|
256
|
+
"title": "",
|
|
257
|
+
"message": "Claude is waiting for your input"
|
|
258
|
+
}
|
|
259
|
+
{
|
|
260
|
+
"timestamp": "2026-02-11T05:11:34Z",
|
|
261
|
+
"type": "idle_prompt",
|
|
262
|
+
"title": "",
|
|
263
|
+
"message": "Claude is waiting for your input"
|
|
264
|
+
}
|
|
265
|
+
{
|
|
266
|
+
"timestamp": "2026-02-11T05:14:10Z",
|
|
267
|
+
"type": "idle_prompt",
|
|
268
|
+
"title": "",
|
|
269
|
+
"message": "Claude is waiting for your input"
|
|
270
|
+
}
|
|
271
|
+
{
|
|
272
|
+
"timestamp": "2026-02-11T05:15:43Z",
|
|
273
|
+
"type": "idle_prompt",
|
|
274
|
+
"title": "",
|
|
275
|
+
"message": "Claude is waiting for your input"
|
|
276
|
+
}
|
|
277
|
+
{
|
|
278
|
+
"timestamp": "2026-02-11T05:20:49Z",
|
|
279
|
+
"type": "idle_prompt",
|
|
280
|
+
"title": "",
|
|
281
|
+
"message": "Claude is waiting for your input"
|
|
282
|
+
}
|
|
283
|
+
{
|
|
284
|
+
"timestamp": "2026-02-11T05:24:55Z",
|
|
285
|
+
"type": "idle_prompt",
|
|
286
|
+
"title": "",
|
|
287
|
+
"message": "Claude is waiting for your input"
|
|
288
|
+
}
|
|
289
|
+
{
|
|
290
|
+
"timestamp": "2026-02-11T05:35:24Z",
|
|
291
|
+
"type": "idle_prompt",
|
|
292
|
+
"title": "",
|
|
293
|
+
"message": "Claude is waiting for your input"
|
|
294
|
+
}
|