source_monitor 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/CLAUDE.md +4 -4
- data/Gemfile.lock +1 -1
- data/VERSION +1 -1
- data/app/assets/builds/source_monitor/application.css +41 -0
- data/app/controllers/source_monitor/items_controller.rb +1 -1
- data/app/controllers/source_monitor/sources_controller.rb +52 -4
- data/app/jobs/source_monitor/scrape_item_job.rb +20 -0
- data/app/models/source_monitor/item_content.rb +29 -0
- data/app/models/source_monitor/source.rb +9 -1
- data/app/views/layouts/source_monitor/application.html.erb +1 -1
- data/app/views/source_monitor/dashboard/_fetch_schedule.html.erb +5 -5
- data/app/views/source_monitor/import_sessions/steps/_confirm.html.erb +2 -1
- data/app/views/source_monitor/items/_details.html.erb +9 -3
- data/app/views/source_monitor/items/index.html.erb +14 -2
- data/app/views/source_monitor/sources/_details.html.erb +14 -2
- data/app/views/source_monitor/sources/_row.html.erb +8 -0
- data/app/views/source_monitor/sources/index.html.erb +92 -14
- data/config/coverage_baseline.json +7 -0
- data/db/migrate/20260222120000_add_min_scrape_interval_to_sources.rb +7 -0
- data/db/migrate/20260222194201_add_word_counts_to_item_contents.rb +8 -0
- data/lib/source_monitor/configuration/scraping_settings.rb +15 -1
- data/lib/source_monitor/scraping/enqueuer.rb +38 -1
- data/lib/source_monitor/version.rb +1 -1
- data/lib/tasks/source_monitor_tasks.rake +14 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 37c1b9f849296d336eef1ced8584d8cad21caadb8c67606332ebd0f5db8b6fea
|
|
4
|
+
data.tar.gz: 36b8646b5f89a6b2b4317b9c914268b9ab399cd7b8cb4cb3aa9ee38809d7fb85
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d05d79d697e3d2889e0c4e48f8559601c8f43787d94db7d70e517badd606e2ced0252015b3d95bb3721b1acd3ef0bc1096c280c704c602f2a16269160322ef02
|
|
7
|
+
data.tar.gz: 607355cdf142b9fd7587004c570ece825ac30e1f40864b39fef934e347cf13c22ff09bda06b0f5bf8dd6e1984b3bcc4644d3667e22890c0d6d9bfe5fa28fcf09
|
data/CHANGELOG.md
CHANGED
|
@@ -15,6 +15,26 @@ All notable changes to this project are documented below. The format follows [Ke
|
|
|
15
15
|
|
|
16
16
|
- No unreleased changes yet.
|
|
17
17
|
|
|
18
|
+
## [0.9.0] - 2026-02-22
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- **Sources pagination and filtering.** Sources index now paginates (25 per page, configurable) with Previous/Next controls. Dropdown filters for Status, Health, Format, and Scraper Adapter auto-submit on change. Active filters shown as dismissible badges. Text search and dropdown filters compose as intersection and persist across pagination.
|
|
23
|
+
- **Per-source scrape rate limiting.** New `min_scrape_interval` column on sources allows time-based throttling between scrapes. Global default (1.0s) configurable via `config.scraping.min_scrape_interval`. Per-source overrides via the column value. ScrapeItemJob and Enqueuer check last scrape time from `scrape_logs` and re-enqueue with delay when rate-limited.
|
|
24
|
+
- **Word count metrics.** New `feed_word_count` and `scraped_word_count` columns on `item_contents`. Feed content is HTML-stripped before counting; scraped content counted as-is (readability-cleaned). Separate "Avg Feed Words" and "Avg Scraped Words" columns on sources index. Separate "Feed Words" and "Scraped Words" columns on items index and source detail items table. Backfill rake task: `source_monitor:backfill_word_counts`.
|
|
25
|
+
|
|
26
|
+
### Fixed
|
|
27
|
+
|
|
28
|
+
- Show `created_at` fallback when `published_at` is nil in items table.
|
|
29
|
+
- Handle source destroy failures with proper error responses instead of silent failures.
|
|
30
|
+
- UI fixes: navigation warning indicator positioning, toast container placement, dashboard table alignment.
|
|
31
|
+
- N+1 query fix: source detail items table now uses `includes(:item_content)`.
|
|
32
|
+
|
|
33
|
+
### Testing
|
|
34
|
+
|
|
35
|
+
- 1,175 tests, 3,683 assertions, 0 failures.
|
|
36
|
+
- RuboCop: 0 offenses (423 files).
|
|
37
|
+
|
|
18
38
|
## [0.8.1] - 2026-02-21
|
|
19
39
|
|
|
20
40
|
### Fixed
|
data/CLAUDE.md
CHANGED
|
@@ -4,10 +4,10 @@
|
|
|
4
4
|
|
|
5
5
|
## Active Context
|
|
6
6
|
|
|
7
|
-
**Milestone:** (
|
|
8
|
-
**
|
|
9
|
-
**Previous:**
|
|
10
|
-
**Next action:** /vbw:vibe to
|
|
7
|
+
**Milestone:** polish-and-reliability (extended)
|
|
8
|
+
**Phase:** 4 of 5 -- Bug Fixes & Polish (pending planning)
|
|
9
|
+
**Previous phases:** Backend Fixes, Favicon Support, Toast Stacking (all complete)
|
|
10
|
+
**Next action:** /vbw:vibe to plan and execute Phase 4
|
|
11
11
|
|
|
12
12
|
## Key Decisions
|
|
13
13
|
|
data/Gemfile.lock
CHANGED
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.
|
|
1
|
+
0.9.0
|
|
@@ -675,6 +675,10 @@ video {
|
|
|
675
675
|
top: 1rem;
|
|
676
676
|
}
|
|
677
677
|
|
|
678
|
+
.fm-admin .top-16 {
|
|
679
|
+
top: 4rem;
|
|
680
|
+
}
|
|
681
|
+
|
|
678
682
|
.fm-admin .z-10 {
|
|
679
683
|
z-index: 10;
|
|
680
684
|
}
|
|
@@ -741,6 +745,14 @@ video {
|
|
|
741
745
|
margin-top: 1.5rem;
|
|
742
746
|
}
|
|
743
747
|
|
|
748
|
+
.fm-admin .mb-1 {
|
|
749
|
+
margin-bottom: 0.25rem;
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
.fm-admin .ml-0\.5 {
|
|
753
|
+
margin-left: 0.125rem;
|
|
754
|
+
}
|
|
755
|
+
|
|
744
756
|
.fm-admin .block {
|
|
745
757
|
display: block;
|
|
746
758
|
}
|
|
@@ -853,10 +865,30 @@ video {
|
|
|
853
865
|
width: 100%;
|
|
854
866
|
}
|
|
855
867
|
|
|
868
|
+
.fm-admin .w-\[15\%\] {
|
|
869
|
+
width: 15%;
|
|
870
|
+
}
|
|
871
|
+
|
|
872
|
+
.fm-admin .w-\[18\%\] {
|
|
873
|
+
width: 18%;
|
|
874
|
+
}
|
|
875
|
+
|
|
876
|
+
.fm-admin .w-\[22\%\] {
|
|
877
|
+
width: 22%;
|
|
878
|
+
}
|
|
879
|
+
|
|
880
|
+
.fm-admin .w-\[45\%\] {
|
|
881
|
+
width: 45%;
|
|
882
|
+
}
|
|
883
|
+
|
|
856
884
|
.fm-admin .min-w-full {
|
|
857
885
|
min-width: 100%;
|
|
858
886
|
}
|
|
859
887
|
|
|
888
|
+
.fm-admin .min-w-\[12rem\] {
|
|
889
|
+
min-width: 12rem;
|
|
890
|
+
}
|
|
891
|
+
|
|
860
892
|
.fm-admin .max-w-2xl {
|
|
861
893
|
max-width: 42rem;
|
|
862
894
|
}
|
|
@@ -1579,6 +1611,10 @@ video {
|
|
|
1579
1611
|
font-weight: 600;
|
|
1580
1612
|
}
|
|
1581
1613
|
|
|
1614
|
+
.fm-admin .font-bold {
|
|
1615
|
+
font-weight: 700;
|
|
1616
|
+
}
|
|
1617
|
+
|
|
1582
1618
|
.fm-admin .uppercase {
|
|
1583
1619
|
text-transform: uppercase;
|
|
1584
1620
|
}
|
|
@@ -1927,6 +1963,11 @@ video {
|
|
|
1927
1963
|
color: rgb(255 255 255 / var(--tw-text-opacity, 1));
|
|
1928
1964
|
}
|
|
1929
1965
|
|
|
1966
|
+
.fm-admin .hover\:text-blue-700:hover {
|
|
1967
|
+
--tw-text-opacity: 1;
|
|
1968
|
+
color: rgb(29 78 216 / var(--tw-text-opacity, 1));
|
|
1969
|
+
}
|
|
1970
|
+
|
|
1930
1971
|
.fm-admin .hover\:underline:hover {
|
|
1931
1972
|
text-decoration-line: underline;
|
|
1932
1973
|
}
|
|
@@ -5,7 +5,7 @@ module SourceMonitor
|
|
|
5
5
|
include ActionView::RecordIdentifier
|
|
6
6
|
include SourceMonitor::SanitizesSearchParams
|
|
7
7
|
|
|
8
|
-
searchable_with scope: -> { Item.active.includes(:source) }, default_sorts: [ "published_at desc", "created_at desc" ]
|
|
8
|
+
searchable_with scope: -> { Item.active.includes(:source, :item_content) }, default_sorts: [ "published_at desc", "created_at desc" ]
|
|
9
9
|
|
|
10
10
|
PER_PAGE = 25
|
|
11
11
|
SEARCH_FIELD = :title_or_summary_or_url_or_source_name_cont
|
|
@@ -12,6 +12,7 @@ module SourceMonitor
|
|
|
12
12
|
searchable_with scope: -> { Source.all }, default_sorts: [ "created_at desc" ]
|
|
13
13
|
|
|
14
14
|
ITEMS_PREVIEW_LIMIT = SourceMonitor::Scraping::BulkSourceScraper::DEFAULT_PREVIEW_LIMIT
|
|
15
|
+
PER_PAGE = 25
|
|
15
16
|
|
|
16
17
|
before_action :set_source, only: %i[show edit update destroy]
|
|
17
18
|
|
|
@@ -21,14 +22,23 @@ module SourceMonitor
|
|
|
21
22
|
@search_params = sanitized_search_params
|
|
22
23
|
@q = build_search_query
|
|
23
24
|
|
|
24
|
-
|
|
25
|
+
paginator = SourceMonitor::Pagination::Paginator.new(
|
|
26
|
+
scope: @q.result,
|
|
27
|
+
page: params[:page],
|
|
28
|
+
per_page: params[:per_page] || PER_PAGE
|
|
29
|
+
).paginate
|
|
30
|
+
|
|
31
|
+
@sources = paginator.records
|
|
32
|
+
@page = paginator.page
|
|
33
|
+
@has_next_page = paginator.has_next_page
|
|
34
|
+
@has_previous_page = paginator.has_previous_page
|
|
25
35
|
|
|
26
36
|
@search_term = @search_params[SEARCH_FIELD.to_s].to_s.strip
|
|
27
37
|
@search_field = SEARCH_FIELD
|
|
28
38
|
|
|
29
39
|
metrics = SourceMonitor::Analytics::SourcesIndexMetrics.new(
|
|
30
40
|
base_scope: Source.all,
|
|
31
|
-
result_scope:
|
|
41
|
+
result_scope: paginator.records,
|
|
32
42
|
search_params: @search_params
|
|
33
43
|
)
|
|
34
44
|
|
|
@@ -38,12 +48,26 @@ module SourceMonitor
|
|
|
38
48
|
@fetch_interval_filter = metrics.fetch_interval_filter
|
|
39
49
|
@selected_fetch_interval_bucket = metrics.selected_fetch_interval_bucket
|
|
40
50
|
@item_activity_rates = metrics.item_activity_rates
|
|
51
|
+
|
|
52
|
+
source_ids = @sources.map(&:id)
|
|
53
|
+
if source_ids.any?
|
|
54
|
+
base = ItemContent.joins(:item).where(sourcemon_items: { source_id: source_ids })
|
|
55
|
+
@avg_feed_word_counts = base.where.not(feed_word_count: nil)
|
|
56
|
+
.group("sourcemon_items.source_id")
|
|
57
|
+
.average(:feed_word_count)
|
|
58
|
+
@avg_scraped_word_counts = base.where.not(scraped_word_count: nil)
|
|
59
|
+
.group("sourcemon_items.source_id")
|
|
60
|
+
.average(:scraped_word_count)
|
|
61
|
+
else
|
|
62
|
+
@avg_feed_word_counts = {}
|
|
63
|
+
@avg_scraped_word_counts = {}
|
|
64
|
+
end
|
|
41
65
|
end
|
|
42
66
|
|
|
43
67
|
def show
|
|
44
68
|
@recent_fetch_logs = @source.fetch_logs.order(started_at: :desc).limit(5)
|
|
45
69
|
@recent_scrape_logs = @source.scrape_logs.order(started_at: :desc).limit(5)
|
|
46
|
-
@items = @source.items.recent.limit(ITEMS_PREVIEW_LIMIT)
|
|
70
|
+
@items = @source.items.recent.includes(:item_content).limit(ITEMS_PREVIEW_LIMIT)
|
|
47
71
|
@bulk_scrape_selection = :current
|
|
48
72
|
end
|
|
49
73
|
|
|
@@ -75,7 +99,17 @@ module SourceMonitor
|
|
|
75
99
|
|
|
76
100
|
def destroy
|
|
77
101
|
search_params = sanitized_search_params
|
|
78
|
-
|
|
102
|
+
|
|
103
|
+
begin
|
|
104
|
+
unless @source.destroy
|
|
105
|
+
handle_destroy_failure(search_params, "Could not delete source: #{@source.errors.full_messages.join(', ')}")
|
|
106
|
+
return
|
|
107
|
+
end
|
|
108
|
+
rescue ActiveRecord::InvalidForeignKey
|
|
109
|
+
handle_destroy_failure(search_params, "Cannot delete source: other records still reference it. Remove dependent records first.")
|
|
110
|
+
return
|
|
111
|
+
end
|
|
112
|
+
|
|
79
113
|
message = "Source deleted"
|
|
80
114
|
|
|
81
115
|
respond_to do |format|
|
|
@@ -132,6 +166,20 @@ module SourceMonitor
|
|
|
132
166
|
sanitized.start_with?("/") ? sanitized : nil
|
|
133
167
|
end
|
|
134
168
|
|
|
169
|
+
# Reports a failed source deletion back to the client.
#
# Turbo Stream requests get an error-level toast rendered with a 422
# (unprocessable entity) status; plain HTML requests are redirected to the
# sources index, carrying the current search params and an alert flash.
#
# @param search_params [Hash] sanitized search params to preserve in the redirect
# @param error_message [String] human-readable explanation shown to the user
def handle_destroy_failure(search_params, error_message)
  respond_to do |format|
    format.turbo_stream do
      stream = SourceMonitor::TurboStreams::StreamResponder.new
      stream.toast(message: error_message, level: :error)

      render turbo_stream: stream.render(view_context), status: :unprocessable_entity
    end

    format.html { redirect_to source_monitor.sources_path(q: search_params), alert: error_message }
  end
end
|
|
182
|
+
|
|
135
183
|
def enqueue_favicon_fetch(source)
|
|
136
184
|
return unless defined?(ActiveStorage)
|
|
137
185
|
return unless SourceMonitor.config.favicons.enabled?
|
|
@@ -18,6 +18,14 @@ module SourceMonitor
|
|
|
18
18
|
return
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
+
remaining = time_until_scrape_allowed(source)
|
|
22
|
+
if remaining&.positive?
|
|
23
|
+
SourceMonitor::Scraping::State.clear_inflight!(item)
|
|
24
|
+
self.class.set(wait: remaining.seconds).perform_later(item_id)
|
|
25
|
+
log("job:deferred", item: item, wait_seconds: remaining)
|
|
26
|
+
return
|
|
27
|
+
end
|
|
28
|
+
|
|
21
29
|
SourceMonitor::Scraping::State.mark_processing!(item)
|
|
22
30
|
SourceMonitor::Scraping::ItemScraper.new(item:, source:).call
|
|
23
31
|
log("job:completed", item: item, status: item.scrape_status)
|
|
@@ -31,6 +39,18 @@ module SourceMonitor
|
|
|
31
39
|
|
|
32
40
|
private
|
|
33
41
|
|
|
42
|
+
# Computes how long a scrape for +source+ must still be delayed.
#
# The minimum gap between scrapes is the source's own +min_scrape_interval+,
# falling back to the global scraping configuration when the source has none.
# The time already elapsed is measured from the most recent +started_at+
# among the source's scrape logs.
#
# @param source [#min_scrape_interval, #scrape_logs] the source being scraped
# @return [Integer, nil] whole seconds left to wait (rounded up), or nil when
#   no interval applies, no scrape has been logged, or the interval has passed
def time_until_scrape_allowed(source)
  min_gap = source.min_scrape_interval || SourceMonitor.config.scraping.min_scrape_interval
  return nil if min_gap.nil? || min_gap <= 0

  previous_start = source.scrape_logs.maximum(:started_at)
  return nil unless previous_start

  seconds_left = min_gap - (Time.current - previous_start)
  seconds_left.ceil if seconds_left.positive?
end
|
|
53
|
+
|
|
34
54
|
def log(stage, item: nil, item_id: nil, **extra)
|
|
35
55
|
return unless defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
36
56
|
|
|
@@ -8,6 +8,35 @@ module SourceMonitor
|
|
|
8
8
|
|
|
9
9
|
has_many_attached :images if defined?(ActiveStorage)
|
|
10
10
|
|
|
11
|
+
before_save :compute_word_counts
|
|
12
|
+
|
|
11
13
|
SourceMonitor::ModelExtensions.register(self, :item_content)
|
|
14
|
+
|
|
15
|
+
# Returns the larger of the scraped and feed word counts.
#
# @return [Integer] the greatest known count, or 0 when neither is set
def total_word_count
  known_counts = [ feed_word_count, scraped_word_count ].compact
  known_counts.empty? ? 0 : known_counts.max
end
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
# Refreshes both cached word counts: the scraped-content count first,
# then the feed-content count.
def compute_word_counts
  compute_scraped_word_count
  compute_feed_word_count
end
|
|
25
|
+
|
|
26
|
+
# Recomputes +scraped_word_count+ from +scraped_content+ when it may be stale.
#
# The count is refreshed when the scraped content changed, when the record is
# new, or when content exists but no count has been stored yet. Words are
# whitespace-separated tokens; blank content yields a nil count.
def compute_scraped_word_count
  stale = scraped_content_changed? ||
          new_record? ||
          (scraped_word_count.nil? && scraped_content.present?)
  return unless stale

  self.scraped_word_count =
    if scraped_content.present?
      scraped_content.split.size
    end
end
|
|
31
|
+
|
|
32
|
+
# Recomputes +feed_word_count+ from the associated item's feed content.
#
# The content is passed through ActionView's full sanitizer before counting
# whitespace-separated words. The count is nil when there is no item, the
# content is blank, or nothing remains after sanitizing.
def compute_feed_word_count
  raw_feed = item&.content
  plain_text = raw_feed.blank? ? nil : ActionView::Base.full_sanitizer.sanitize(raw_feed)

  self.feed_word_count = plain_text.present? ? plain_text.split.size : nil
end
|
|
12
41
|
end
|
|
13
42
|
end
|
|
@@ -62,7 +62,8 @@ module SourceMonitor
|
|
|
62
62
|
end
|
|
63
63
|
|
|
64
64
|
def ransackable_attributes(_auth_object = nil)
|
|
65
|
-
%w[name feed_url website_url created_at fetch_interval_minutes items_count last_fetched_at
|
|
65
|
+
%w[name feed_url website_url created_at fetch_interval_minutes items_count last_fetched_at
|
|
66
|
+
active health_status feed_format scraper_adapter]
|
|
66
67
|
end
|
|
67
68
|
|
|
68
69
|
def ransackable_associations(_auth_object = nil)
|
|
@@ -103,6 +104,13 @@ module SourceMonitor
|
|
|
103
104
|
update_columns(items_count: actual_count)
|
|
104
105
|
end
|
|
105
106
|
|
|
107
|
+
# Average scraped word count across this source's items, considering only
# item contents that have a non-nil +scraped_word_count+.
#
# @return [Integer, nil] the rounded average, or nil when the query
#   produces no average
def avg_word_count
  counted = items.joins(:item_content)
                 .where.not(sourcemon_item_contents: { scraped_word_count: nil })
  mean = counted.average("sourcemon_item_contents.scraped_word_count")
  mean&.round
end
|
|
113
|
+
|
|
106
114
|
private
|
|
107
115
|
|
|
108
116
|
def health_auto_pause_threshold_within_bounds
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
</head>
|
|
14
14
|
<body class="fm-admin">
|
|
15
15
|
<%= turbo_stream_from "source_monitor_notifications" %>
|
|
16
|
-
<div class="pointer-events-none fixed inset-x-0 top-
|
|
16
|
+
<div class="pointer-events-none fixed inset-x-0 top-16 z-50 flex justify-end px-6"
|
|
17
17
|
data-controller="notification-container">
|
|
18
18
|
<div class="flex w-full max-w-sm flex-col items-end gap-3">
|
|
19
19
|
<div id="source_monitor_notifications"
|
|
@@ -25,13 +25,13 @@
|
|
|
25
25
|
|
|
26
26
|
<% if group.sources.any? %>
|
|
27
27
|
<div class="overflow-x-auto px-5 py-4">
|
|
28
|
-
<table class="min-w-full divide-y divide-slate-200 text-left text-sm">
|
|
28
|
+
<table class="min-w-full table-fixed divide-y divide-slate-200 text-left text-sm">
|
|
29
29
|
<thead class="text-xs font-semibold uppercase tracking-wide text-slate-500">
|
|
30
30
|
<tr>
|
|
31
|
-
<th scope="col" class="px-4 py-2">Source</th>
|
|
32
|
-
<th scope="col" class="px-4 py-2">Status</th>
|
|
33
|
-
<th scope="col" class="px-4 py-2">Next Fetch</th>
|
|
34
|
-
<th scope="col" class="px-4 py-2">Interval</th>
|
|
31
|
+
<th scope="col" class="w-[45%] px-4 py-2">Source</th>
|
|
32
|
+
<th scope="col" class="w-[15%] px-4 py-2">Status</th>
|
|
33
|
+
<th scope="col" class="w-[22%] px-4 py-2">Next Fetch</th>
|
|
34
|
+
<th scope="col" class="w-[18%] px-4 py-2">Interval</th>
|
|
35
35
|
</tr>
|
|
36
36
|
</thead>
|
|
37
37
|
<tbody class="divide-y divide-slate-100 text-slate-700">
|
|
@@ -112,7 +112,8 @@
|
|
|
112
112
|
|
|
113
113
|
<%= form_with model: import_session,
|
|
114
114
|
url: source_monitor.step_import_session_path(import_session, step: "confirm"),
|
|
115
|
-
method: :patch
|
|
115
|
+
method: :patch,
|
|
116
|
+
data: { action: "submit->confirm-navigation#disable" } do |form| %>
|
|
116
117
|
<%= hidden_field_tag :next_step, "confirm", name: "import_session[next_step]" %>
|
|
117
118
|
<%= form.submit "Start import", class: "inline-flex items-center rounded-md bg-blue-600 px-4 py-2 text-sm font-semibold text-white shadow hover:bg-blue-500" %>
|
|
118
119
|
<% end %>
|
|
@@ -36,7 +36,11 @@
|
|
|
36
36
|
<span class="text-slate-400">No source</span>
|
|
37
37
|
<% end %>
|
|
38
38
|
·
|
|
39
|
-
|
|
39
|
+
<% if item.published_at %>
|
|
40
|
+
<%= item.published_at.strftime("%b %d, %Y %H:%M %Z") %>
|
|
41
|
+
<% else %>
|
|
42
|
+
<span class="text-slate-400"><%= item.created_at.strftime("%b %d, %Y %H:%M %Z") %></span>
|
|
43
|
+
<% end %>
|
|
40
44
|
</p>
|
|
41
45
|
</div>
|
|
42
46
|
</div>
|
|
@@ -56,7 +60,7 @@
|
|
|
56
60
|
"URL" => (item.url.present? ? external_link_to(item.url, item.url, class: "text-slate-900 hover:text-blue-500") : "\u2014"),
|
|
57
61
|
"Canonical URL" => (item.canonical_url.present? ? external_link_to(item.canonical_url, item.canonical_url, class: "text-slate-900 hover:text-blue-500") : "\u2014"),
|
|
58
62
|
"Author" => item.author || "—",
|
|
59
|
-
"Published At" => (item.published_at&.strftime("%b %d, %Y %H:%M %Z") || "
|
|
63
|
+
"Published At" => (item.published_at&.strftime("%b %d, %Y %H:%M %Z") || item.created_at.strftime("%b %d, %Y %H:%M %Z")),
|
|
60
64
|
"Updated At (Source)" => (item.updated_at_source&.strftime("%b %d, %Y %H:%M %Z") || "—"),
|
|
61
65
|
"Language" => item.language || "—",
|
|
62
66
|
"Categories" => categories_list.present? ? categories_list.join(", ") : "—",
|
|
@@ -126,7 +130,9 @@
|
|
|
126
130
|
</div>
|
|
127
131
|
<div class="space-y-2 px-5 py-4 text-sm text-slate-700">
|
|
128
132
|
<p><span class="font-medium text-slate-600">Comments:</span> <%= item.comments_count || 0 %></p>
|
|
129
|
-
<p><span class="font-medium text-slate-600">Feed Items in Source:</span> <%= source&.items_count || "
|
|
133
|
+
<p><span class="font-medium text-slate-600">Feed Items in Source:</span> <%= source&.items_count || "\u2014" %></p>
|
|
134
|
+
<p><span class="font-medium text-slate-600">Feed Word Count:</span> <%= item.item_content&.feed_word_count || "\u2014" %></p>
|
|
135
|
+
<p><span class="font-medium text-slate-600">Scraped Word Count:</span> <%= item.item_content&.scraped_word_count || "\u2014" %></p>
|
|
130
136
|
</div>
|
|
131
137
|
</div>
|
|
132
138
|
|
|
@@ -64,6 +64,8 @@
|
|
|
64
64
|
</span>
|
|
65
65
|
</th>
|
|
66
66
|
<th scope="col" class="px-6 py-3">Scrape Status</th>
|
|
67
|
+
<th scope="col" class="px-6 py-3">Feed Words</th>
|
|
68
|
+
<th scope="col" class="px-6 py-3">Scraped Words</th>
|
|
67
69
|
</tr>
|
|
68
70
|
</thead>
|
|
69
71
|
<tbody class="divide-y divide-slate-100 text-slate-700">
|
|
@@ -91,7 +93,11 @@
|
|
|
91
93
|
<% end %>
|
|
92
94
|
</td>
|
|
93
95
|
<td class="px-6 py-4 text-xs text-slate-500">
|
|
94
|
-
|
|
96
|
+
<% if item.published_at %>
|
|
97
|
+
<%= item.published_at.strftime("%b %d, %Y %H:%M") %>
|
|
98
|
+
<% else %>
|
|
99
|
+
<span class="text-slate-400"><%= item.created_at.strftime("%b %d, %Y %H:%M") %></span>
|
|
100
|
+
<% end %>
|
|
95
101
|
</td>
|
|
96
102
|
<td class="px-6 py-4 text-xs">
|
|
97
103
|
<% status_label, status_classes =
|
|
@@ -107,12 +113,18 @@
|
|
|
107
113
|
end %>
|
|
108
114
|
<span class="inline-flex items-center rounded-full px-3 py-1 font-semibold <%= status_classes %>"><%= status_label %></span>
|
|
109
115
|
</td>
|
|
116
|
+
<td class="px-6 py-4 text-xs text-slate-500">
|
|
117
|
+
<%= item.item_content&.feed_word_count || "\u2014" %>
|
|
118
|
+
</td>
|
|
119
|
+
<td class="px-6 py-4 text-xs text-slate-500">
|
|
120
|
+
<%= item.item_content&.scraped_word_count || "\u2014" %>
|
|
121
|
+
</td>
|
|
110
122
|
</tr>
|
|
111
123
|
<% end %>
|
|
112
124
|
|
|
113
125
|
<% if @items.blank? %>
|
|
114
126
|
<tr>
|
|
115
|
-
<td colspan="
|
|
127
|
+
<td colspan="6" class="px-6 py-6 text-center text-sm text-slate-500">
|
|
116
128
|
No items found.
|
|
117
129
|
</td>
|
|
118
130
|
</tr>
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
<% recent_fetch_logs = source.fetch_logs.order(started_at: :desc).limit(5) %>
|
|
3
3
|
<% recent_scrape_logs = source.scrape_logs.order(started_at: :desc).limit(5) %>
|
|
4
4
|
<% preview_limit = SourceMonitor::Scraping::BulkSourceScraper::DEFAULT_PREVIEW_LIMIT %>
|
|
5
|
-
<% items = source.items.recent.limit(preview_limit) %>
|
|
5
|
+
<% items = source.items.recent.includes(:item_content).limit(preview_limit) %>
|
|
6
6
|
<% fetch_status = async_status_badge(source.fetch_status) %>
|
|
7
7
|
<% health_status_override = local_assigns[:health_status_override] %>
|
|
8
8
|
|
|
@@ -281,6 +281,8 @@
|
|
|
281
281
|
<th scope="col" class="px-5 py-3">Tags</th>
|
|
282
282
|
<th scope="col" class="px-5 py-3">Published</th>
|
|
283
283
|
<th scope="col" class="px-5 py-3">Scrape Status</th>
|
|
284
|
+
<th scope="col" class="px-5 py-3">Feed Words</th>
|
|
285
|
+
<th scope="col" class="px-5 py-3">Scraped Words</th>
|
|
284
286
|
</tr>
|
|
285
287
|
</thead>
|
|
286
288
|
<tbody class="divide-y divide-slate-100 text-slate-700">
|
|
@@ -299,7 +301,11 @@
|
|
|
299
301
|
<td class="px-5 py-4 text-xs text-slate-500"><%= categories.present? ? categories.join(", ") : "—" %></td>
|
|
300
302
|
<td class="px-5 py-4 text-xs text-slate-500"><%= tags.present? ? tags.join(", ") : "—" %></td>
|
|
301
303
|
<td class="px-5 py-4 text-xs text-slate-500">
|
|
302
|
-
|
|
304
|
+
<% if item.published_at %>
|
|
305
|
+
<%= item.published_at.strftime("%b %d, %Y %H:%M") %>
|
|
306
|
+
<% else %>
|
|
307
|
+
<span class="text-slate-400"><%= item.created_at.strftime("%b %d, %Y %H:%M") %></span>
|
|
308
|
+
<% end %>
|
|
303
309
|
</td>
|
|
304
310
|
<td class="px-5 py-4 text-xs">
|
|
305
311
|
<% scrape_badge = item_scrape_status_badge(item: item, source: source) %>
|
|
@@ -312,6 +318,12 @@
|
|
|
312
318
|
<%= scrape_badge[:label] %>
|
|
313
319
|
</span>
|
|
314
320
|
</td>
|
|
321
|
+
<td class="px-5 py-4 text-xs text-slate-500">
|
|
322
|
+
<%= item.item_content&.feed_word_count || "\u2014" %>
|
|
323
|
+
</td>
|
|
324
|
+
<td class="px-5 py-4 text-xs text-slate-500">
|
|
325
|
+
<%= item.item_content&.scraped_word_count || "\u2014" %>
|
|
326
|
+
</td>
|
|
315
327
|
</tr>
|
|
316
328
|
<% end %>
|
|
317
329
|
</tbody>
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
<% rate_map = local_assigns[:item_activity_rates] || {} %>
|
|
2
|
+
<% avg_feed_words_map = local_assigns[:avg_feed_word_counts] || {} %>
|
|
3
|
+
<% avg_scraped_words_map = local_assigns[:avg_scraped_word_counts] || {} %>
|
|
2
4
|
<% activity_rate = rate_map.fetch(source.id, 0.0) %>
|
|
3
5
|
<% health_status_override = local_assigns[:health_status_override] %>
|
|
4
6
|
<% health_status = if !source.active?
|
|
@@ -66,6 +68,12 @@
|
|
|
66
68
|
<%= number_with_precision(activity_rate, precision: 2) %>
|
|
67
69
|
<span class="text-xs text-slate-500">/ day</span>
|
|
68
70
|
</td>
|
|
71
|
+
<td class="px-6 py-4 text-sm text-slate-500">
|
|
72
|
+
<%= avg_feed_words_map[source.id]&.round || "\u2014" %>
|
|
73
|
+
</td>
|
|
74
|
+
<td class="px-6 py-4 text-sm text-slate-500">
|
|
75
|
+
<%= avg_scraped_words_map[source.id]&.round || "\u2014" %>
|
|
76
|
+
</td>
|
|
69
77
|
<td class="px-6 py-4 text-xs text-slate-500">
|
|
70
78
|
<%= source.last_fetched_at ? source.last_fetched_at.strftime("%b %d, %H:%M") : "Never" %>
|
|
71
79
|
</td>
|
|
@@ -5,22 +5,42 @@
|
|
|
5
5
|
<h1 class="text-3xl font-semibold text-slate-900">Sources</h1>
|
|
6
6
|
<p class="mt-1 text-sm text-slate-500">Manage feed endpoints and scraping settings.</p>
|
|
7
7
|
</div>
|
|
8
|
-
<div class="flex
|
|
9
|
-
<%=
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
<%= form.search_field @search_field, placeholder: "Search name or URL…", class: "w-full rounded-l-md border border-slate-200 bg-white px-3 py-2 text-sm text-slate-700 focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500 sm:min-w-[16rem]" %>
|
|
13
|
-
<%= form.submit "Search", class: "rounded-r-md bg-blue-600 px-4 py-2 text-sm font-semibold text-white hover:bg-blue-500" %>
|
|
14
|
-
</div>
|
|
15
|
-
<% end %>
|
|
16
|
-
<div class="flex flex-col gap-2 sm:flex-row sm:items-center sm:gap-3">
|
|
17
|
-
<%= link_to "New Source", source_monitor.new_source_path, class: "inline-flex items-center justify-center rounded-md bg-blue-600 px-4 py-2 text-sm font-semibold text-white shadow hover:bg-blue-500" %>
|
|
18
|
-
<%= link_to "Import OPML", source_monitor.new_import_session_path,
|
|
19
|
-
class: "inline-flex items-center justify-center rounded-md border border-slate-200 px-4 py-2 text-sm font-semibold text-slate-700 shadow-sm hover:bg-slate-50" %>
|
|
20
|
-
</div>
|
|
8
|
+
<div class="flex flex-col gap-2 sm:flex-row sm:items-center sm:gap-3">
|
|
9
|
+
<%= link_to "New Source", source_monitor.new_source_path, class: "inline-flex items-center justify-center rounded-md bg-blue-600 px-4 py-2 text-sm font-semibold text-white shadow hover:bg-blue-500" %>
|
|
10
|
+
<%= link_to "Import OPML", source_monitor.new_import_session_path,
|
|
11
|
+
class: "inline-flex items-center justify-center rounded-md border border-slate-200 px-4 py-2 text-sm font-semibold text-slate-700 shadow-sm hover:bg-slate-50" %>
|
|
21
12
|
</div>
|
|
22
13
|
</div>
|
|
23
14
|
|
|
15
|
+
<%= search_form_for @q, url: source_monitor.sources_path, method: :get, html: { class: "flex flex-wrap items-end gap-3", data: { turbo_frame: "source_monitor_sources_table" } } do |form| %>
|
|
16
|
+
<div class="flex-1 min-w-[12rem]">
|
|
17
|
+
<%= form.label @search_field, "Search sources", class: "sr-only" %>
|
|
18
|
+
<div class="flex rounded-md shadow-sm">
|
|
19
|
+
<%= form.search_field @search_field, placeholder: "Search name or URL…", class: "w-full rounded-l-md border border-slate-200 bg-white px-3 py-2 text-sm text-slate-700 focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500" %>
|
|
20
|
+
<%= form.submit "Search", class: "rounded-r-md bg-blue-600 px-4 py-2 text-sm font-semibold text-white hover:bg-blue-500" %>
|
|
21
|
+
</div>
|
|
22
|
+
</div>
|
|
23
|
+
<div class="flex flex-wrap items-end gap-2">
|
|
24
|
+
<div>
|
|
25
|
+
<%= form.label :active_eq, "Status", class: "block text-xs font-medium text-slate-500 mb-1" %>
|
|
26
|
+
<%= form.select :active_eq, options_for_select([["All Statuses", ""], ["Active", "true"], ["Paused", "false"]], @search_params["active_eq"].to_s), {}, class: "rounded-md border border-slate-200 bg-white px-2 py-2 text-sm text-slate-700 focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500", onchange: "this.form.requestSubmit()" %>
|
|
27
|
+
</div>
|
|
28
|
+
<div>
|
|
29
|
+
<%= form.label :health_status_eq, "Health", class: "block text-xs font-medium text-slate-500 mb-1" %>
|
|
30
|
+
<%= form.select :health_status_eq, options_for_select([["All Health", ""], ["Healthy", "healthy"], ["Warning", "warning"], ["Declining", "declining"], ["Critical", "critical"]], @search_params["health_status_eq"].to_s), {}, class: "rounded-md border border-slate-200 bg-white px-2 py-2 text-sm text-slate-700 focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500", onchange: "this.form.requestSubmit()" %>
|
|
31
|
+
</div>
|
|
32
|
+
<div>
|
|
33
|
+
<%= form.label :feed_format_eq, "Format", class: "block text-xs font-medium text-slate-500 mb-1" %>
|
|
34
|
+
<%= form.select :feed_format_eq, options_for_select([["All Formats", ""], ["RSS", "rss"], ["Atom", "atom"], ["JSON", "json"]], @search_params["feed_format_eq"].to_s), {}, class: "rounded-md border border-slate-200 bg-white px-2 py-2 text-sm text-slate-700 focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500", onchange: "this.form.requestSubmit()" %>
|
|
35
|
+
</div>
|
|
36
|
+
<div>
|
|
37
|
+
<% adapter_options = SourceMonitor::Source.distinct.where.not(scraper_adapter: [nil, ""]).order(:scraper_adapter).pluck(:scraper_adapter) %>
|
|
38
|
+
<%= form.label :scraper_adapter_eq, "Adapter", class: "block text-xs font-medium text-slate-500 mb-1" %>
|
|
39
|
+
<%= form.select :scraper_adapter_eq, options_for_select([["All Adapters", ""]] + adapter_options.map { |a| [a.titleize, a] }, @search_params["scraper_adapter_eq"].to_s), {}, class: "rounded-md border border-slate-200 bg-white px-2 py-2 text-sm text-slate-700 focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500", onchange: "this.form.requestSubmit()" %>
|
|
40
|
+
</div>
|
|
41
|
+
</div>
|
|
42
|
+
<% end %>
|
|
43
|
+
|
|
24
44
|
<%= render "source_monitor/sources/import_history_panel", import_histories: @recent_import_histories %>
|
|
25
45
|
|
|
26
46
|
<%= render "source_monitor/sources/fetch_interval_heatmap",
|
|
@@ -30,7 +50,10 @@
|
|
|
30
50
|
|
|
31
51
|
<div class="overflow-hidden rounded-lg border border-slate-200 bg-white shadow-sm">
|
|
32
52
|
<%= turbo_frame_tag "source_monitor_sources_table" do %>
|
|
33
|
-
<%
|
|
53
|
+
<% dropdown_filter_keys = %w[active_eq health_status_eq feed_format_eq scraper_adapter_eq] %>
|
|
54
|
+
<% active_dropdown_filters = dropdown_filter_keys.select { |k| @search_params[k].present? } %>
|
|
55
|
+
<% has_any_filter = @search_term.present? || @fetch_interval_filter.present? || active_dropdown_filters.any? %>
|
|
56
|
+
<% if has_any_filter %>
|
|
34
57
|
<div class="rounded-t-lg border-b border-blue-100 bg-blue-50 px-4 py-3 text-xs text-blue-700">
|
|
35
58
|
<% if @search_term.present? %>
|
|
36
59
|
<% clear_search_query = @search_params.dup %>
|
|
@@ -56,6 +79,32 @@
|
|
|
56
79
|
class: "font-medium text-blue-600 hover:text-blue-500" %>
|
|
57
80
|
</div>
|
|
58
81
|
<% end %>
|
|
82
|
+
|
|
83
|
+
<% if active_dropdown_filters.any? %>
|
|
84
|
+
<div class="mt-1 flex flex-wrap items-center gap-2">
|
|
85
|
+
<span>Filtered by</span>
|
|
86
|
+
<% filter_labels = {
|
|
87
|
+
"active_eq" => @search_params["active_eq"] == "true" ? "Status: Active" : "Status: Paused",
|
|
88
|
+
"health_status_eq" => "Health: #{@search_params['health_status_eq']&.titleize}",
|
|
89
|
+
"feed_format_eq" => "Format: #{@search_params['feed_format_eq']&.upcase}",
|
|
90
|
+
"scraper_adapter_eq" => "Adapter: #{@search_params['scraper_adapter_eq']&.titleize}"
|
|
91
|
+
} %>
|
|
92
|
+
<% active_dropdown_filters.each do |filter_key| %>
|
|
93
|
+
<% clear_query = @search_params.dup %>
|
|
94
|
+
<% clear_query.delete(filter_key) %>
|
|
95
|
+
<% clear_query = if clear_query.respond_to?(:compact_blank)
|
|
96
|
+
clear_query.compact_blank
|
|
97
|
+
else
|
|
98
|
+
clear_query.reject { |_k, v| v.respond_to?(:blank?) ? v.blank? : v.nil? }
|
|
99
|
+
end %>
|
|
100
|
+
<% clear_path = clear_query.empty? ? source_monitor.sources_path : source_monitor.sources_path(q: clear_query) %>
|
|
101
|
+
<span class="inline-flex items-center gap-1 rounded-full bg-blue-100 px-2 py-0.5 text-xs font-medium text-blue-700">
|
|
102
|
+
<%= filter_labels[filter_key] %>
|
|
103
|
+
<%= link_to "×", clear_path, class: "ml-0.5 font-bold text-blue-500 hover:text-blue-700", data: { turbo_frame: "source_monitor_sources_table" } %>
|
|
104
|
+
</span>
|
|
105
|
+
<% end %>
|
|
106
|
+
</div>
|
|
107
|
+
<% end %>
|
|
59
108
|
</div>
|
|
60
109
|
<% end %>
|
|
61
110
|
<table class="min-w-full divide-y divide-slate-200 text-left text-sm">
|
|
@@ -120,6 +169,8 @@
|
|
|
120
169
|
</span>
|
|
121
170
|
</th>
|
|
122
171
|
<th scope="col" class="px-6 py-3">New Items / Day</th>
|
|
172
|
+
<th scope="col" class="px-6 py-3">Avg Feed Words</th>
|
|
173
|
+
<th scope="col" class="px-6 py-3">Avg Scraped Words</th>
|
|
123
174
|
<th scope="col"
|
|
124
175
|
class="px-6 py-3"
|
|
125
176
|
data-sort-column="last_fetched_at"
|
|
@@ -148,12 +199,39 @@
|
|
|
148
199
|
as: :source,
|
|
149
200
|
locals: {
|
|
150
201
|
item_activity_rates: @item_activity_rates,
|
|
202
|
+
avg_feed_word_counts: @avg_feed_word_counts,
|
|
203
|
+
avg_scraped_word_counts: @avg_scraped_word_counts,
|
|
151
204
|
search_params: @search_params
|
|
152
205
|
} %>
|
|
153
206
|
|
|
154
207
|
<%= render("source_monitor/sources/empty_state_row") if @sources.blank? %>
|
|
155
208
|
</tbody>
|
|
156
209
|
</table>
|
|
210
|
+
<div class="flex flex-col items-center gap-3 border-t border-slate-200 px-6 py-4 sm:flex-row sm:justify-between">
|
|
211
|
+
<div class="text-xs text-slate-500">
|
|
212
|
+
Page <%= @page %>
|
|
213
|
+
</div>
|
|
214
|
+
<div class="flex gap-2">
|
|
215
|
+
<% prev_params = { page: @page - 1 } %>
|
|
216
|
+
<% prev_params[:q] = @search_params if @search_params.present? %>
|
|
217
|
+
<% prev_params[:per_page] = params[:per_page] if params[:per_page].present? %>
|
|
218
|
+
<% next_params = { page: @page + 1 } %>
|
|
219
|
+
<% next_params[:q] = @search_params if @search_params.present? %>
|
|
220
|
+
<% next_params[:per_page] = params[:per_page] if params[:per_page].present? %>
|
|
221
|
+
|
|
222
|
+
<% if @has_previous_page %>
|
|
223
|
+
<%= link_to "Previous", source_monitor.sources_path(prev_params), class: "inline-flex items-center rounded-md border border-slate-300 px-3 py-2 text-sm font-medium text-slate-700 hover:bg-slate-50", data: { turbo_frame: "source_monitor_sources_table" } %>
|
|
224
|
+
<% else %>
|
|
225
|
+
<span class="inline-flex items-center rounded-md border border-slate-200 px-3 py-2 text-sm font-medium text-slate-300">Previous</span>
|
|
226
|
+
<% end %>
|
|
227
|
+
|
|
228
|
+
<% if @has_next_page %>
|
|
229
|
+
<%= link_to "Next", source_monitor.sources_path(next_params), class: "inline-flex items-center rounded-md border border-slate-300 px-3 py-2 text-sm font-medium text-slate-700 hover:bg-slate-50", data: { turbo_frame: "source_monitor_sources_table" } %>
|
|
230
|
+
<% else %>
|
|
231
|
+
<span class="inline-flex items-center rounded-md border border-slate-200 px-3 py-2 text-sm font-medium text-slate-300">Next</span>
|
|
232
|
+
<% end %>
|
|
233
|
+
</div>
|
|
234
|
+
</div>
|
|
157
235
|
<% end %>
|
|
158
236
|
</div>
|
|
159
237
|
</div>
|
|
@@ -3,10 +3,11 @@
|
|
|
3
3
|
module SourceMonitor
|
|
4
4
|
class Configuration
|
|
5
5
|
class ScrapingSettings
|
|
6
|
-
attr_accessor :max_in_flight_per_source, :max_bulk_batch_size
|
|
6
|
+
attr_accessor :max_in_flight_per_source, :max_bulk_batch_size, :min_scrape_interval
|
|
7
7
|
|
|
8
8
|
DEFAULT_MAX_IN_FLIGHT = nil
|
|
9
9
|
DEFAULT_MAX_BULK_BATCH_SIZE = 100
|
|
10
|
+
DEFAULT_MIN_SCRAPE_INTERVAL = 1.0
|
|
10
11
|
|
|
11
12
|
def initialize
|
|
12
13
|
reset!
|
|
@@ -15,6 +16,7 @@ module SourceMonitor
|
|
|
15
16
|
def reset!
|
|
16
17
|
@max_in_flight_per_source = DEFAULT_MAX_IN_FLIGHT
|
|
17
18
|
@max_bulk_batch_size = DEFAULT_MAX_BULK_BATCH_SIZE
|
|
19
|
+
@min_scrape_interval = DEFAULT_MIN_SCRAPE_INTERVAL
|
|
18
20
|
end
|
|
19
21
|
|
|
20
22
|
def max_in_flight_per_source=(value)
|
|
@@ -25,6 +27,10 @@ module SourceMonitor
|
|
|
25
27
|
@max_bulk_batch_size = normalize_numeric(value)
|
|
26
28
|
end
|
|
27
29
|
|
|
30
|
+
def min_scrape_interval=(value)
|
|
31
|
+
@min_scrape_interval = normalize_numeric_float(value)
|
|
32
|
+
end
|
|
33
|
+
|
|
28
34
|
private
|
|
29
35
|
|
|
30
36
|
def normalize_numeric(value)
|
|
@@ -34,6 +40,14 @@ module SourceMonitor
|
|
|
34
40
|
integer = value.respond_to?(:to_i) ? value.to_i : value
|
|
35
41
|
integer.positive? ? integer : nil
|
|
36
42
|
end
|
|
43
|
+
|
|
44
|
+
def normalize_numeric_float(value)
|
|
45
|
+
return nil if value.nil?
|
|
46
|
+
return nil if value == ""
|
|
47
|
+
|
|
48
|
+
float = value.respond_to?(:to_f) ? value.to_f : value
|
|
49
|
+
float.positive? ? float : nil
|
|
50
|
+
end
|
|
37
51
|
end
|
|
38
52
|
end
|
|
39
53
|
end
|
|
@@ -14,8 +14,12 @@ module SourceMonitor
|
|
|
14
14
|
status == :already_enqueued
|
|
15
15
|
end
|
|
16
16
|
|
|
17
|
+
def deferred?
|
|
18
|
+
status == :deferred
|
|
19
|
+
end
|
|
20
|
+
|
|
17
21
|
def failure?
|
|
18
|
-
!enqueued? && !already_enqueued?
|
|
22
|
+
!enqueued? && !already_enqueued? && !deferred?
|
|
19
23
|
end
|
|
20
24
|
end
|
|
21
25
|
|
|
@@ -44,6 +48,8 @@ module SourceMonitor
|
|
|
44
48
|
already_queued = false
|
|
45
49
|
rate_limited = false
|
|
46
50
|
rate_limit_info = nil
|
|
51
|
+
time_limited = false
|
|
52
|
+
time_limit_info = nil
|
|
47
53
|
|
|
48
54
|
item.with_lock do
|
|
49
55
|
item.reload
|
|
@@ -61,6 +67,13 @@ module SourceMonitor
|
|
|
61
67
|
next
|
|
62
68
|
end
|
|
63
69
|
|
|
70
|
+
limited, t_info = time_rate_limited?
|
|
71
|
+
if limited
|
|
72
|
+
time_limited = true
|
|
73
|
+
time_limit_info = t_info
|
|
74
|
+
next
|
|
75
|
+
end
|
|
76
|
+
|
|
64
77
|
SourceMonitor::Scraping::State.mark_pending!(item, broadcast: false, lock: false)
|
|
65
78
|
end
|
|
66
79
|
|
|
@@ -75,6 +88,14 @@ module SourceMonitor
|
|
|
75
88
|
return Result.new(status: :rate_limited, message:, item: item)
|
|
76
89
|
end
|
|
77
90
|
|
|
91
|
+
if time_limited
|
|
92
|
+
wait_seconds = time_limit_info[:wait_seconds]
|
|
93
|
+
job_class.set(wait: wait_seconds.seconds).perform_later(item.id)
|
|
94
|
+
message = "Scrape deferred: source was scraped #{time_limit_info[:interval]}s ago, re-enqueued with #{wait_seconds}s delay."
|
|
95
|
+
log("enqueue:deferred", item:, wait_seconds:, interval: time_limit_info[:interval])
|
|
96
|
+
return Result.new(status: :deferred, message:, item: item)
|
|
97
|
+
end
|
|
98
|
+
|
|
78
99
|
job_class.perform_later(item.id)
|
|
79
100
|
log("enqueue:job_enqueued", item:, job_class: job_class.name)
|
|
80
101
|
Result.new(status: :enqueued, message: "Scrape has been enqueued for processing.", item: item)
|
|
@@ -105,6 +126,22 @@ module SourceMonitor
|
|
|
105
126
|
nil
|
|
106
127
|
end
|
|
107
128
|
|
|
129
|
+
def time_rate_limited?
|
|
130
|
+
interval = source.min_scrape_interval || SourceMonitor.config.scraping.min_scrape_interval
|
|
131
|
+
return [ false, nil ] if interval.nil? || interval <= 0
|
|
132
|
+
|
|
133
|
+
last_scrape_at = source.scrape_logs.maximum(:started_at)
|
|
134
|
+
return [ false, nil ] unless last_scrape_at
|
|
135
|
+
|
|
136
|
+
elapsed = Time.current - last_scrape_at
|
|
137
|
+
if elapsed < interval
|
|
138
|
+
wait_seconds = (interval - elapsed).ceil
|
|
139
|
+
[ true, { wait_seconds:, interval:, last_scrape_at: } ]
|
|
140
|
+
else
|
|
141
|
+
[ false, nil ]
|
|
142
|
+
end
|
|
143
|
+
end
|
|
144
|
+
|
|
108
145
|
def rate_limit_exhausted?
|
|
109
146
|
limit = SourceMonitor.config.scraping.max_in_flight_per_source
|
|
110
147
|
return [ false, nil ] unless limit
|
|
@@ -1,6 +1,20 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
namespace :source_monitor do
|
|
4
|
+
desc "Backfill word counts for existing item_content records."
|
|
5
|
+
task backfill_word_counts: :environment do
|
|
6
|
+
total = SourceMonitor::ItemContent.count
|
|
7
|
+
processed = 0
|
|
8
|
+
|
|
9
|
+
SourceMonitor::ItemContent.find_each do |content|
|
|
10
|
+
content.save!
|
|
11
|
+
processed += 1
|
|
12
|
+
puts "Processed #{processed}/#{total} records..." if (processed % 100).zero?
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
puts "Done. Backfilled word counts for #{processed} records."
|
|
16
|
+
end
|
|
17
|
+
|
|
4
18
|
namespace :cleanup do
|
|
5
19
|
desc "Run retention pruning across sources. Accepts SOURCE_IDS and SOFT_DELETE env vars."
|
|
6
20
|
task items: :environment do
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: source_monitor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.8.1
|
|
4
|
+
version: 0.9.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- dchuk
|
|
@@ -490,6 +490,8 @@ files:
|
|
|
490
490
|
- db/migrate/20251124153000_add_health_fields_to_import_sessions.rb
|
|
491
491
|
- db/migrate/20251125094500_create_import_histories.rb
|
|
492
492
|
- db/migrate/20260210204022_add_composite_index_to_log_entries.rb
|
|
493
|
+
- db/migrate/20260222120000_add_min_scrape_interval_to_sources.rb
|
|
494
|
+
- db/migrate/20260222194201_add_word_counts_to_item_contents.rb
|
|
493
495
|
- docs/configuration.md
|
|
494
496
|
- docs/deployment.md
|
|
495
497
|
- docs/gh-cli-workflow.md
|