newsfetcher 0.84
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/LICENSE.txt +21 -0
- data/README.md +123 -0
- data/Rakefile +1 -0
- data/TODO.md +67 -0
- data/bin/newsfetcher +10 -0
- data/lib/newsfetcher/command.rb +20 -0
- data/lib/newsfetcher/commands/add.rb +19 -0
- data/lib/newsfetcher/commands/dir.rb +18 -0
- data/lib/newsfetcher/commands/disable.rb +18 -0
- data/lib/newsfetcher/commands/discover.rb +28 -0
- data/lib/newsfetcher/commands/edit.rb +19 -0
- data/lib/newsfetcher/commands/enable.rb +18 -0
- data/lib/newsfetcher/commands/fix.rb +18 -0
- data/lib/newsfetcher/commands/get.rb +31 -0
- data/lib/newsfetcher/commands/init.rb +28 -0
- data/lib/newsfetcher/commands/remove.rb +19 -0
- data/lib/newsfetcher/commands/reset.rb +16 -0
- data/lib/newsfetcher/commands/show.rb +26 -0
- data/lib/newsfetcher/commands/update.rb +39 -0
- data/lib/newsfetcher/error.rb +6 -0
- data/lib/newsfetcher/extensions/addressable-uri.rb +24 -0
- data/lib/newsfetcher/extensions/mail.rb +22 -0
- data/lib/newsfetcher/extensions/string.rb +15 -0
- data/lib/newsfetcher/fetcher.rb +99 -0
- data/lib/newsfetcher/history.rb +101 -0
- data/lib/newsfetcher/item.rb +60 -0
- data/lib/newsfetcher/profile.rb +89 -0
- data/lib/newsfetcher/scrubber.rb +70 -0
- data/lib/newsfetcher/subscription.rb +274 -0
- data/lib/newsfetcher/version.rb +5 -0
- data/lib/newsfetcher.rb +86 -0
- data/message/stylesheet.css +48 -0
- data/newsfetcher.gemspec +42 -0
- data/test/history_test.rb +60 -0
- data/test/main_test.rb +66 -0
- metadata +374 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bb18377fe7a1b55c9a1f9778b6824294ab8c65cad6f8260d388b61629247de48
|
4
|
+
data.tar.gz: a0648cec658c5299075ac08a3e5524a766d14aac20f0b8efd37603a607e4be93
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 636aad38f91c50934a524f7862143bc0b31b7b544b5a2f82686dcf51ce6ac3a91d361e407fb55190b7c13465e396c241bfc550b1f660e13176644c53eb4f8f3f
|
7
|
+
data.tar.gz: 3b5710a57847e4653c822b57242437df7d21576bcedc39d5ba1eb7b91df605e9e6590ff23434fb4121dc1f904ba155085c24e78bf2142a20f6a54387260e5300
|
data/.gitignore
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2025 John Labovitz
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
# NewsFetcher
|
2
|
+
|
3
|
+
NewsFetcher monitors RSS and Atom feeds and delivers new items as individual HTML emails. It tracks which feed items have already been delivered, so you only receive new content.
|
4
|
+
|
5
|
+
|
6
|
+
## Features
|
7
|
+
|
8
|
+
- Monitors multiple RSS/Atom feeds organized into subscriptions
|
9
|
+
- Delivers each new feed item as a separate HTML email with custom styling
|
10
|
+
- Tracks item history to avoid re-delivering the same content
|
11
|
+
- Supports multiple delivery methods (SMTP, Maildir, etc.)
|
12
|
+
- Configurable update intervals and age limits per subscription
|
13
|
+
- Multi-threaded updating for efficiency
|
14
|
+
- Feed discovery from web pages
|
15
|
+
- Per-subscription and global configuration
|
16
|
+
|
17
|
+
|
18
|
+
## Installation
|
19
|
+
|
20
|
+
```
|
21
|
+
gem install newsfetcher
|
22
|
+
```
|
23
|
+
|
24
|
+
|
25
|
+
## Quick Start
|
26
|
+
|
27
|
+
Initialize a profile with your email settings:
|
28
|
+
|
29
|
+
```
|
30
|
+
newsfetcher init --mail-from feeds@example.com --mail-to you@example.com
|
31
|
+
```
|
32
|
+
|
33
|
+
Discover a feed for a website:
|
34
|
+
|
35
|
+
```
|
36
|
+
newsfetcher discover https://example.com/
|
37
|
+
```
|
38
|
+
|
39
|
+
Add a feed subscription:
|
40
|
+
|
41
|
+
```
|
42
|
+
newsfetcher add https://example.com/feed.xml
|
43
|
+
```
|
44
|
+
|
45
|
+
Update all subscriptions and deliver new items:
|
46
|
+
|
47
|
+
```
|
48
|
+
newsfetcher update
|
49
|
+
```
|
50
|
+
|
51
|
+
|
52
|
+
## Usage
|
53
|
+
|
54
|
+
NewsFetcher organizes feeds into subscriptions stored in `~/.newsfetcher` by default. Each subscription has its own configuration and tracks which items have been delivered.
|
55
|
+
|
56
|
+
### Commands
|
57
|
+
|
58
|
+
- `init` — Initialize a new profile with email settings
|
59
|
+
- `add URI [PATH] [ID]` — Add a new feed subscription
|
60
|
+
- `update [IDs...]` — Update subscriptions and deliver new items
|
61
|
+
- `show [IDs...]` — Display subscription details
|
62
|
+
- `enable ID` — Enable a disabled subscription
|
63
|
+
- `disable ID` — Disable a subscription
|
64
|
+
- `remove ID` — Remove a subscription
|
65
|
+
- `reset ID` — Clear item history (next update will re-deliver all items)
|
66
|
+
- `edit ID` — Edit subscription configuration
|
67
|
+
- `dir` — Show the profile directory path
|
68
|
+
- `discover URI` — Find feeds in a web page
|
69
|
+
- `get URI` — Fetch and display a feed without saving
|
70
|
+
- `uniq` — Remove duplicate items from history
|
71
|
+
- `fix` — Fix subscription issues
|
72
|
+
|
73
|
+
### Configuration
|
74
|
+
|
75
|
+
Configuration lives in JSON files at the profile and subscription levels. Profile-level settings in `~/.newsfetcher/config.json` serve as defaults for all subscriptions.
|
76
|
+
|
77
|
+
Key configuration options:
|
78
|
+
|
79
|
+
- `mail_from`, `mail_to` — Email sender and recipient
|
80
|
+
- `mail_subject` — Subject line template (supports ERB)
|
81
|
+
- `delivery_method` — How to deliver mail (`:smtp`, `:maildir`, etc.)
|
82
|
+
- `delivery_params` — Parameters for the delivery method
|
83
|
+
- `update_interval` — Minimum time between updates (default: 1 hour)
|
84
|
+
- `max_age` — How long to track items (default: 30 days)
|
85
|
+
- `max_threads` — Number of concurrent subscription updates (default: 100)
|
86
|
+
- `disabled` — Whether to skip this subscription during updates
|
87
|
+
- `ignore_uris` — Array of regex patterns to ignore items
|
88
|
+
- `root_folder` — Prefix for maildir folders or mail subject
|
89
|
+
- `consolidate` — Use shorter folder names
|
90
|
+
|
91
|
+
Subscription-specific settings override profile defaults.
|
92
|
+
|
93
|
+
### Styling
|
94
|
+
|
95
|
+
Email messages use HTML with embedded CSS compiled from SCSS. The default stylesheet is included, but you can specify additional stylesheets:
|
96
|
+
|
97
|
+
```json
|
98
|
+
{
|
99
|
+
"aux_stylesheets": ["~/.newsfetcher/custom.scss"]
|
100
|
+
}
|
101
|
+
```
|
102
|
+
|
103
|
+
|
104
|
+
## Automation
|
105
|
+
|
106
|
+
Run `newsfetcher update` periodically using cron, launchd, or your preferred scheduler to automatically check feeds and deliver new items.
|
107
|
+
|
108
|
+
|
109
|
+
## How It Works
|
110
|
+
|
111
|
+
NewsFetcher fetches each feed, parses it with Feedjira, and compares items against the subscription’s history. New items (not previously seen and within the configured age limit) are formatted as HTML emails and delivered using Ruby’s Mail gem.
|
112
|
+
|
113
|
+
Each subscription maintains two history files: one tracking delivered items, one tracking HTTP responses. These histories are automatically pruned based on the `max_age` setting.
|
114
|
+
|
115
|
+
|
116
|
+
## Requirements
|
117
|
+
|
118
|
+
Ruby 2.7 or later.
|
119
|
+
|
120
|
+
|
121
|
+
## License
|
122
|
+
|
123
|
+
MIT
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'simple-rake-tasks'
|
data/TODO.md
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
# TODO
|
2
|
+
|
3
|
+
## BUGS
|
4
|
+
|
5
|
+
|
6
|
+
## IMPROVEMENTS
|
7
|
+
|
8
|
+
- add 'audit' command
|
9
|
+
- interactively modify subscriptions
|
10
|
+
- show activity, dormancy, errors
|
11
|
+
- enable/disable, delete, rename
|
12
|
+
|
13
|
+
- keep original web address in config
|
14
|
+
- to solve problems when feed disappears
|
15
|
+
|
16
|
+
- implement retry on error in fetching/parsing
|
17
|
+
- eg, to handle timeouts, connection/read errors, bad data, etc.
|
18
|
+
- only show error if > retry count
|
19
|
+
|
20
|
+
- add new Feed class to encapsulate title/items
|
21
|
+
|
22
|
+
- save last fetch response to file (Marshaled?)
|
23
|
+
|
24
|
+
- rename 'path' to 'section'?
|
25
|
+
|
26
|
+
- add per-subscription locks to avoid access by multiple processes/threads
|
27
|
+
|
28
|
+
- convert HTML document builder to ERB, for easier customization
|
29
|
+
|
30
|
+
|
31
|
+
## FEATURES
|
32
|
+
|
33
|
+
- allow unit suffixes on durations (s, m, h, d, w)
|
34
|
+
|
35
|
+
- allow 'add' to take 'id' option to customize ID
|
36
|
+
|
37
|
+
- add 'remove' feature to remove HTML element
|
38
|
+
- specifies XPath expression to remove
|
39
|
+
|
40
|
+
- auto-discover on 'add'
|
41
|
+
- or auto-add on discover?
|
42
|
+
|
43
|
+
- allow update by section (eg, world)
|
44
|
+
|
45
|
+
- add 'config' command
|
46
|
+
- '--root' specifies root config
|
47
|
+
- use readline to show/edit values
|
48
|
+
|
49
|
+
- expand 'ignore' feature
|
50
|
+
- each rule can match on any fields
|
51
|
+
ignore:
|
52
|
+
uri: /foo
|
53
|
+
title: Bar
|
54
|
+
|
55
|
+
- add check/validate command
|
56
|
+
- fetch HTML page for feed
|
57
|
+
- verify that feed matches <link> element
|
58
|
+
|
59
|
+
|
60
|
+
## RELEASE
|
61
|
+
|
62
|
+
- write README documentation
|
63
|
+
- bump version to 1.0
|
64
|
+
- reset git history
|
65
|
+
- push to GitHub
|
66
|
+
- release to RubyGems
|
67
|
+
- release publicly
|
data/bin/newsfetcher
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module NewsFetcher

  # Common parent of the command classes. Its #run resolves the profile
  # directory, obtains a configuration, applies the optional overrides held in
  # the accessors below, and stores the resulting Profile in @profile.
  class Command < Simple::CommandParser::Command

    # Optional overrides; #run ignores each one that is left unset.
    attr_accessor :dir
    attr_accessor :log_level
    attr_accessor :max_threads

    # Builds @profile from the directory in @dir (DefaultProfileDir when @dir
    # is unset). The configuration is loaded from ConfigFileName inside that
    # directory when the file exists; otherwise BaseConfig.make supplies one.
    # `args` is not read here.
    #
    # Raises Error if Profile.new yields a falsy value.
    def run(args)
      profile_dir = Path.new(@dir || DefaultProfileDir)
      config_path = profile_dir / ConfigFileName
      settings =
        if config_path.exist?
          BaseConfig.load(config_path)
        else
          BaseConfig.make
        end
      # Overrides are only applied when the corresponding value was supplied.
      settings.log_level = @log_level.downcase.to_sym if @log_level
      settings.max_threads = @max_threads.to_i if @max_threads
      @profile = Profile.new(dir: profile_dir, config: settings) or raise Error, "Profile not loaded"
    end

  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module NewsFetcher

  module Commands

    # Command that adds a subscription to the profile.
    class Add < Command

      # Reads up to three positional arguments (URI, path, ID, in that order)
      # and hands them to the profile's add_subscription, then reports the
      # value it returned on standard error.
      #
      # Raises Error when no URI is given.
      def run(args)
        super
        uri, path, id = args
        raise Error, "No URI specified" if !uri
        added = @profile.add_subscription(uri: uri, id: id, path: path)
        warn "Added subscription: #{added}"
      end

    end

  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module NewsFetcher

  module Commands

    # Command that fetches each given URI and prints the feeds found there.
    class Discover < Command

      # Parses every argument as a URI first, then fetches them one by one.
      # For a successful fetch, each feed from find_feeds is printed as
      # right-aligned "key: value" lines followed by a blank line. For an
      # unsuccessful fetch, the HTTP status and reason go to standard error.
      def run(args)
        super
        # All arguments are parsed before the first fetch is attempted.
        uris = args.map { |arg| Addressable::URI.parse(arg) }
        uris.each do |uri|
          fetcher = Fetcher.get(uri)
          unless fetcher.success?
            warn "#{uri}: HTTP error #{fetcher.response_status} (#{fetcher.response_reason})"
            next
          end
          fetcher.find_feeds.each do |feed|
            feed.each { |key, value| puts format("%10s: %s", key, value) }
            puts
          end
        end
      end

    end

  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module NewsFetcher

  module Commands

    # Command that opens subscription configuration files in an editor.
    class Edit < Command

      # Runs the program named by the EDITOR environment variable once per
      # matching subscription, passing the path of ConfigFileName inside that
      # subscription's directory.
      #
      # Raises Error when EDITOR is not set.
      def run(args)
        super
        editor = ENV['EDITOR']
        raise Error, "No editor defined in $EDITOR" unless editor
        @profile.find_subscriptions(ids: args).each do |subscription|
          config_path = subscription.dir / ConfigFileName
          # Separate arguments: the path is passed as its own argv entry.
          system(editor, config_path.to_s)
        end
      end

    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module NewsFetcher

  module Commands

    # Command that fetches feeds and prints their contents.
    class Get < Command

      # Fetches each URI in turn. On success the parsed feed's URI, title and
      # items are printed to standard output; on failure the HTTP status and
      # reason are written to standard error.
      def run(args)
        super
        args.each do |uri|
          fetcher = Fetcher.get(uri)
          unless fetcher.success?
            warn "#{uri}: HTTP error #{fetcher.response_status} (#{fetcher.response_reason})"
            next
          end
          feed = fetcher.parse_feed
          puts
          puts "URI: #{uri}"
          puts "Title: #{feed[:title]}"
          puts "Items:"
          feed[:items].each do |item|
            item.print
            puts
          end
        end
      end

    end

  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module NewsFetcher

  module Commands

    # Command that creates a profile from the supplied mail settings and
    # saves it.
    class Init < Command

      # Sender and recipient addresses; both must be set before #run.
      attr_accessor :mail_from
      attr_accessor :mail_to

      # Replaces @profile with a new Profile whose configuration is made from
      # mail_from and mail_to, then saves it.
      #
      # Raises Error when mail_from or mail_to is missing.
      def run(args)
        super
        raise Error, "Must specify mail_from" unless @mail_from
        raise Error, "Must specify mail_to" unless @mail_to
        @profile = Profile.new(
          # Resolve the directory the same way Command#run does, so an unset
          # @dir falls back to DefaultProfileDir instead of passing nil.
          dir: Path.new(@dir || DefaultProfileDir),
          config: BaseConfig.make(
            mail_from: @mail_from,
            mail_to: @mail_to,
          ),
        )
        # Use the instance variable: no `profile` reader is defined on the
        # command classes visible here.
        @profile.save
      end

    end

  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module NewsFetcher

  module Commands

    # Command that deletes subscriptions from disk.
    class Remove < Command

      # Removes the directory tree of every subscription matching the given
      # IDs.
      #
      # Raises Error when a matching subscription has no directory set.
      def run(args)
        super
        @profile.find_subscriptions(ids: args).each do |subscription|
          target = subscription.dir
          raise Error, "dir not set" unless target
          target.rmtree
        end
      end

    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module NewsFetcher

  module Commands

    # Command that prints subscriptions.
    class Show < Command

      # Optional filters: `status` is a comma-separated string and `sort` a
      # single name. Both are converted to symbols before the lookup.
      attr_accessor :status
      attr_accessor :sort

      # Looks up subscriptions by ID, status and sort order, then prints each
      # one followed by a blank line.
      def run(args)
        super
        # Unset filters stay nil and are passed through as nil.
        statuses = @status&.split(',')&.map(&:to_sym)
        sort_key = @sort&.to_sym
        matches = @profile.find_subscriptions(ids: args, status: statuses, sort: sort_key)
        matches.each do |subscription|
          subscription.print
          puts
        end
      end

    end

  end

end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module NewsFetcher

  module Commands

    # Command that updates subscriptions, either one after another or in
    # batches of threads.
    class Update < Command

      # Updates every matching subscription that is not disabled. When the
      # configured max_threads is 0 the updates run sequentially in the calling
      # thread; otherwise they are dispatched through #run_threads.
      def run(args)
        super
        active = @profile.find_subscriptions(ids: args).reject(&:disabled?)
        if @profile.config.max_threads == 0
          active.each(&:update)
        else
          run_threads(active, &:update)
        end
      end

      # Yields each object to the given block from its own thread. Threads are
      # started until max_threads are outstanding; the whole batch is then
      # joined before the next thread is started. Any remaining threads are
      # joined at the end.
      def run_threads(objects, &block)
        batch = []
        objects.each do |object|
          if batch.length >= @profile.config.max_threads
            $logger.debug { "Waiting for #{batch.length} threads to finish" }
            batch.each(&:join)
            batch = []
          end
          batch << Thread.new(object) do |item|
            $logger.debug { "Started thread for #{item.id}" }
            yield(item)
          end
        end
        unless batch.empty?
          $logger.debug { "Waiting for last #{batch.length} threads to finish" }
          batch.each(&:join)
        end
      end
    end

  end

end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Addressable

  class URI

    # Derives a short, hyphen-separated identifier from this URI's host, path
    # and query.
    #
    # Common host prefixes (www, feeds, news, ...) and common domain suffixes
    # are dropped from the host; generic feed words are dropped from the path;
    # feed-format parameters are dropped from the query. What remains is
    # lower-cased and every run of non-alphanumeric characters becomes a
    # single hyphen.
    #
    # Returns a String, which is empty when nothing distinctive remains.
    def make_subscription_id
      host_part = host.to_s
        # A prefix is only removed when it is the whole first label, optionally
        # followed by a digit or hyphen suffix ("www2.", "en-us."). Hosts that
        # merely start with those letters ("engadget.com") keep their name.
        .sub(/\A(?:www|ssl|en|feeds|rss|blogs?|news)(?:[\d-][^.]*)?\./i, '')
        .sub(/\.(?:com|org|net|info|edu|co\.uk|wordpress\.com|blogspot\.com|feedburner\.com)\z/i, '')
      path_part = path.to_s
        .gsub(/\b(\.?feeds?|index|atom|rss|rss2|xml|rdf|php|blog|posts|default)\b/i, '')
      query_part = query.to_s
        .gsub(/\b(format|feed|type|q)=(atom|rss\.xml|rss2?|xml)/i, '')
      [host_part, path_part, query_part]
        .join(' ')
        .downcase
        .gsub(/[^a-z0-9]+/, ' ') # non-alphanumeric
        .strip
        .gsub(/\s+/, '-')
    end

  end

end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'maildir'
|
2
|
+
|
3
|
+
module Mail

  # Delivery handler that writes a message into a Maildir directory.
  class Maildir

    # The values passed to the constructor; #deliver! reads :dir from them.
    attr_accessor :settings

    # Stores the given values as the settings.
    def initialize(values)
      self.settings = values
    end

    # Adds the message to the Maildir located at settings[:dir], using the
    # Maildir library's Mail serializer.
    #
    # Raises Error when settings[:dir] is nil or false.
    # NOTE(review): `Error` is looked up from inside Mail::Maildir — confirm it
    # resolves to the intended exception class.
    def deliver!(mail)
      target_dir = settings[:dir]
      raise Error, "'dir' not found in settings" unless target_dir
      box = ::Maildir.new(target_dir)
      box.serializer = ::Maildir::Serializer::Mail.new
      box.add(mail)
    end

  end

end
|