newsfetcher 0.84

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
+ module NewsFetcher
2
+
3
+ class Fetcher
4
+
5
+ attr_reader :uri
6
+ attr_accessor :timeout
7
+ attr_accessor :max_redirects
8
+ attr_accessor :moved
9
+ attr_accessor :actual_uri
10
+
11
+ include SetParams
12
+
13
# Builds a fetcher for +uri+, runs its #get, and returns the fetcher itself.
#
# The return value of #get is discarded; callers interrogate the returned
# fetcher instead.
def self.get(uri)
  fetcher = new(uri: uri)
  fetcher.get
  fetcher
end
16
+
17
# Creates a fetcher, layering the caller's +params+ over the defaults
# (timeout 30, max_redirects 5) and passing the merged hash to +super+.
#
# NOTE(review): presumably the SetParams initializer assigns each key through
# its setter (e.g. #uri=) -- confirm against SetParams.
def initialize(**params)
  defaults = { timeout: 30, max_redirects: 5 }
  super(defaults.merge(params))
end
23
+
24
# Stores the address to fetch, parsed with Addressable::URI.parse.
def uri=(uri)
  parsed = Addressable::URI.parse(uri)
  @uri = parsed
end
27
+
28
# Reports whether the most recent request produced a successful response.
#
# Returns false (instead of raising NoMethodError) when no response is
# stored, so it is nil-safe like #response_status, #response_reason and
# #content.
def success?
  !!@response&.success?
end
31
+
32
# Status of the stored response, or nil when there is no response.
def response_status
  return nil if @response.nil?
  @response.status
end
35
+
36
# Reason phrase of the stored response, or nil when there is no response.
def response_reason
  return nil if @response.nil?
  @response.reason_phrase
end
39
+
40
# Body of the stored response, or nil when there is no response.
def content
  return nil if @response.nil?
  @response.body
end
43
+
44
# Parses the fetched body as a feed using Feedjira.
#
# Returns a Hash with :title (the feed title) and :items (one Item per feed
# entry).
#
# Raises Error when there is no response, when the response was not
# successful, or when Feedjira cannot parse the body.
def parse_feed
  raise Error, "No response yet" unless @response
  raise Error, "Can't parse feed in failed response" unless @response.success?
  Feedjira.configure { |settings| settings.strip_whitespace = true }
  parsed =
    begin
      Feedjira.parse(content)
    rescue Feedjira::NoParserAvailable, Date::Error => e
      raise Error, "Can't parse XML feed from #{@uri}: #{e}"
    end
  {
    title: parsed.title,
    items: parsed.entries.map { |feed_entry| Item.new(feed_entry) },
  }
end
58
+
59
# Scans the fetched HTML for <link rel="alternate"> elements whose type is
# listed in FeedTypes.
#
# Returns an Array of Hashes with :uri (the link's href joined onto @uri) and
# :type (the link's type attribute).
#
# Raises Error when there is no response or the response was not successful.
def find_feeds
  raise Error, "No response yet" unless @response
  raise Error, "Can't find feeds in failed response" unless @response.success?
  document = Nokogiri::HTML::Document.parse(content)
  feed_links = document.xpath('//link[@rel="alternate"]').select do |link|
    FeedTypes.include?(link['type'])
  end
  feed_links.map do |link|
    { uri: @uri.join(link['href']), type: link['type'] }
  end
end
70
+
71
# Requests @uri, following up to @max_redirects redirects.
#
# @actual_uri tracks the address currently being requested; @response holds
# the latest response (nil while a request is in flight or after a failure).
#
# The redirect budget counts redirects actually followed: with
# max_redirects = 0 the initial request is still made, and with
# max_redirects = N the (N+1)th redirect response raises.
#
# Returns the final response's success? value.
#
# Raises Error on a Faraday error, on a redirect without a Location header,
# or when the redirect budget is exhausted.
def get
  @actual_uri = @uri
  redirects = 0
  loop do
    @response = nil
    connection = Faraday.new(
      url: @actual_uri,
      request: { timeout: @timeout },
      # NOTE(review): TLS certificate verification is disabled here; kept
      # as-is for compatibility, but it exposes fetches to tampering.
      ssl: { verify: false })
    begin
      @response = connection.get
    rescue Faraday::Error => e
      raise Error, "Error: #{e.message} (#{e.class})"
    end
    return @response.success? unless (300...400).include?(@response.status)
    raise Error, "Too many redirects" if redirects >= @max_redirects
    # NOTE(review): only a 302 sets @moved; 301/307/308 do not -- confirm this
    # is the intended notion of "moved".
    @moved = (@response.status == 302)
    location = @response.headers[:location]
    raise Error, "No Location header found in redirect" unless location
    @actual_uri = @actual_uri.join(Addressable::URI.parse(location))
    redirects += 1
  end
end
96
+
97
+ end
98
+
99
+ end
@@ -0,0 +1,101 @@
1
+ module NewsFetcher
2
+
3
+ class History
4
+
5
+ attr_accessor :file
6
+ attr_accessor :entries
7
+
8
# Opens the history stored at +file+.
#
# file      - location of the history file; wrapped in Path.new.
# index_key - optional entry attribute used to index entries for #[].
#
# Loads the existing entries when the file exists; otherwise touches the
# file so it is present from then on.
def initialize(file:, index_key: nil)
  @file = Path.new(file)
  @entries, @index = [], {}
  @index_key = index_key
  @file.exist? ? load_entries : @file.touch
end
19
+
20
# Reads the history file line by line, turning each line into an Entry via
# Entry.from_json and registering it with add_entry.
#
# Returns the Array of add_entry results, one per line.
def load_entries
  @file.readlines.map { |json_line| add_entry(Entry.from_json(json_line)) }
end
25
+
26
# Records +entry+ in memory and appends its JSON form as one line to the
# history file.
#
# entry - an Entry, or a Hash that is converted with Entry.new(**entry).
def <<(entry)
  record = entry.kind_of?(Hash) ? Entry.new(**entry) : entry
  add_entry(record)
  @file.open('a') { |io| io.puts record.to_json }
end
31
+
32
# Looks up the entry indexed under +key+.
#
# Raises RuntimeError when the history was created without an index_key.
def [](key)
  return @index[key] if @index_key
  raise "No index_key defined"
end
36
+
37
# Number of entries currently held in memory.
def size
  @entries.length
end
40
+
41
# The most recently added entry, or nil when the history is empty.
def last_entry
  @entries[-1]
end
44
+
45
# Removes every entry whose time is earlier than +before+.
#
# The history file is rewritten only when at least one entry was removed;
# when nothing is old enough the file is left untouched.
#
# Returns the Array of removed entries (empty when nothing was pruned).
def prune(before:)
  old = @entries.select { |e| e.time < before }
  return old if old.empty?
  old.each { |e| delete_entry(e) }
  save_entries
  old
end
51
+
52
# Forgets all entries and deletes the history file.
#
# Safe to call when the file is already absent (for example on a second
# reset): the unlink is skipped instead of raising.
def reset
  @entries = []
  @index = {}
  @file.unlink if @file.exist?
end
57
+
58
+ private
59
+
60
# Rewrites the history file from the in-memory entries.
#
# Entries are written, one JSON line each, to a sibling file with a '.new'
# extension, which is then renamed over the history file. The target is not
# unlinked first: the rename replaces it, so a missing target no longer
# raises and the history file never disappears between the two steps.
def save_entries
  new_file = @file.add_extension('.new')
  new_file.open('w') do |io|
    @entries.each { |e| io.puts e.to_json }
  end
  new_file.rename(@file)
end
68
+
69
# Appends +entry+ to the in-memory list and, when an index key is set,
# indexes it under entry[@index_key].
def add_entry(entry)
  @entries.push(entry)
  return unless @index_key
  @index[entry[@index_key]] = entry
end
73
+
74
# Removes +entry+ from the in-memory list and, when an index key is set,
# drops its key from the index.
def delete_entry(entry)
  @entries.delete(entry)
  return unless @index_key
  @index.delete(entry[@index_key])
end
78
+
79
# One history record. Attributes are free-form (OpenStruct); :time is held as
# a Time in memory and stored as integer epoch seconds in JSON.
class Entry < OpenStruct

  class << self

    # Builds an Entry from one JSON document (keys are symbolized).
    def from_json(data)
      attributes = JSON.parse(data, symbolize_names: true)
      parse_json(attributes)
      new(**attributes)
    end

    # Converts json[:time] in place from a number to a Time (via Time.at).
    def parse_json(json)
      json[:time] = Time.at(json[:time])
    end

  end

  # Serializes the entry, replacing :time with its integer value (to_i).
  def to_json(*opts)
    attributes = to_h
    attributes.merge(time: attributes[:time].to_i).to_json(*opts)
  end

end
98
+
99
+ end
100
+
101
+ end
@@ -0,0 +1,60 @@
1
+ module NewsFetcher
2
+
3
+ class Item
4
+
5
+ attr_accessor :id
6
+ attr_accessor :date
7
+ attr_accessor :title
8
+ attr_accessor :uri
9
+ attr_accessor :author
10
+ attr_accessor :content
11
+
12
+ include SetParams
13
+ include Simple::Printer::Printable
14
+
15
# Builds an Item from a feed +entry+.
#
# The entry's url (stripped) becomes the URI when present. The id is the
# entry's entry_id, falling back to the URI. The date falls back to
# Time.now when the entry has no published value. A leading "by " is removed
# from the author when the entry responds to #author. Content falls back to
# the summary.
#
# Raises Error when the url cannot be parsed or when neither an entry_id nor
# a url is available.
def initialize(entry)
  uri =
    if entry.url
      begin
        Addressable::URI.parse(entry.url.strip)
      rescue Addressable::URI::InvalidURIError
        raise Error, "Can't parse URL for entry: #{entry.url.inspect}"
      end
    end
  id = entry.entry_id || uri
  raise Error, "Can't determine ID or URL for entry" unless id
  super(
    id: id.to_s,
    uri: uri,
    date: entry.published || Time.now,
    title: entry.title,
    author: (entry.author&.sub(/^by\s+/i, '') if entry.respond_to?(:author)),
    content: (entry.content || entry.summary)&.to_s,
  )
end
35
+
36
# Field list for Simple::Printer::Printable: :id carries the label 'ID',
# followed by :date, :title, :uri and :author.
def printable
  [[:id, 'ID'], *%i[date title uri author]]
end
45
+
46
# Strict identity comparison: true when +other+ exposes an #id that is eql?
# to this item's id.
#
# Returns false (instead of raising NoMethodError) for nil and for objects
# without an #id, consistent with #== tolerating nil.
#
# NOTE(review): no matching #hash is defined in the visible code, so Hash and
# uniq lookups will not use this comparison -- confirm whether that matters.
def eql?(other)
  other.respond_to?(:id) && @id.eql?(other.id)
end
49
+
50
# Equality by id: true when +other+ exposes an #id equal to this item's id.
#
# Returns false for nil and (instead of raising NoMethodError) for any
# object without an #id.
def ==(other)
  other.respond_to?(:id) && @id == other.id
end
53
+
54
# Difference between the current time and the item's date (Time.now - @date).
def age
  now = Time.now
  now - @date
end
57
+
58
+ end
59
+
60
+ end
@@ -0,0 +1,89 @@
1
+ module NewsFetcher
2
+
3
+ class Profile
4
+
5
+ attr_reader :dir
6
+ attr_accessor :config
7
+
8
+ include SetParams
9
+
10
# Creates a profile from +params+ (passed to +super+), then sets up the
# global logger and compiles the stylesheets.
#
# NOTE(review): setup_logger and setup_styles read @config and @dir, so
# +params+ presumably must supply both -- confirm against SetParams.
def initialize(params={})
  super(params)
  setup_logger
  setup_styles
end
15
+
16
# Installs the global $logger, writing to STDERR at @config.log_level.
#
# Each line is formatted as "<timestamp> <severity>: <message>\n", with the
# timestamp rendered by strftime('%FT%T%:z') and the severity right-aligned
# to five characters.
def setup_logger
  line_format = proc do |severity, timestamp, _progname, msg|
    format("%s %5s: %s\n", timestamp.strftime('%FT%T%:z'), severity, msg)
  end
  $logger = Logger.new(STDERR, level: @config.log_level, formatter: line_format)
end
24
+
25
# Compiles the configured stylesheets into @styles.
#
# The main stylesheet and the auxiliary stylesheets are flattened into one
# list (nil entries dropped); relative paths are resolved against @dir; each
# file is rendered by SassC as compressed SCSS.
#
# Raises Error when @dir is not set.
def setup_styles
  raise Error, "dir not set" unless @dir
  stylesheets = [@config.main_stylesheet, @config.aux_stylesheets].flatten.compact
  @styles = stylesheets.map do |name|
    path = Path.new(name)
    path = @dir / path if path.relative?
    SassC::Engine.new(path.read, syntax: :scss, style: :compressed).render
  end
end
33
+
34
# Stores the profile directory as an expanded Path.
def dir=(dir)
  expanded = Path.new(dir).expand_path
  @dir = expanded
end
37
+
38
# The profile's identifier: the final component of its directory, as a String.
def id
  leaf = @dir.basename
  leaf.to_s
end
41
+
42
# Path of the profile's configuration file (ConfigFileName inside @dir).
def config_file
  file_name = ConfigFileName
  @dir / file_name
end
45
+
46
# Directory holding the profile's subscriptions (SubscriptionsDirName inside
# @dir).
def subscriptions_dir
  dir_name = SubscriptionsDirName
  @dir / dir_name
end
49
+
50
# Ids of every subscription under subscriptions_dir.
#
# A subscription is any directory, at any depth, that contains a file named
# ConfigFileName; its id is that directory's path relative to
# subscriptions_dir.
def all_ids
  base = subscriptions_dir
  base.glob("**/#{ConfigFileName}").map do |config_path|
    config_path.dirname.relative_to(base).to_s
  end
end
53
+
54
# Loads subscriptions, optionally filtered and sorted.
#
# ids    - subscription ids to load; nil or empty means all_ids.
# status - a status or list of statuses to keep; nil keeps everything.
# sort   - name of the Subscription method to sort by (compared as strings);
#          defaults to :id.
#
# Returns a sorted Array of Subscription objects.
#
# Raises RuntimeError, prefixed with the subscription id, when a
# subscription's config cannot be loaded.
def find_subscriptions(ids: nil, status: nil, sort: nil)
  wanted = status
  wanted = [wanted].flatten.compact if wanted
  sort_key = sort || :id
  ids = all_ids if ids.nil? || ids.empty?
  loaded = ids.map do |id|
    subscription_dir = subscriptions_dir / id
    subscription_config =
      begin
        @config.load(subscription_dir / ConfigFileName)
      rescue Config::Error => e
        raise "#{id}: #{e}"
      end
    Subscription.new(id: id, dir: subscription_dir, config: subscription_config, styles: @styles)
  end
  matching = loaded.select { |s| wanted.nil? || wanted.include?(s.status) }
  matching.sort_by { |s| s.send(sort_key).to_s }
end
71
+
72
# Creates a new subscription directory with a saved config for +uri+.
#
# uri  - feed address; must be absolute.
# id   - subscription id; defaults to uri.make_subscription_id.
# path - optional prefix joined to the id with '/'.
#
# Returns the new subscription directory.
#
# Raises Error when the URI is not absolute or the directory already exists.
def add_subscription(uri:, id: nil, path: nil)
  feed_uri = Addressable::URI.parse(uri)
  raise Error, "Bad URI: #{feed_uri}" unless feed_uri.absolute?
  subscription_id = id || feed_uri.make_subscription_id
  subscription_id = "#{path}/#{subscription_id}" if path
  subscription_dir = subscriptions_dir / subscription_id
  if subscription_dir.exist?
    raise Error, "Subscription already exists in #{subscription_dir}"
  end
  subscription_dir.mkpath
  target = subscription_dir / ConfigFileName
  @config.make(uri: feed_uri).save(target)
  $logger.info { "Saved new subscription to #{target}" }
  subscription_dir
end
86
+
87
+ end
88
+
89
+ end
@@ -0,0 +1,70 @@
1
+ module NewsFetcher
2
+
3
+ module Scrubber
4
+
5
# Cleans an HTML fragment for delivery.
#
# Applies, in order: Loofah's :prune scrubber, RemoveExtras, RemoveVoxFooter,
# RemoveStyling and ReplaceBlockquote, then serializes the result to HTML.
def self.scrub_html(html)
  fragment = Loofah.fragment(html)
  [:prune, RemoveExtras, RemoveVoxFooter, RemoveStyling, ReplaceBlockquote].each do |scrubber|
    fragment.scrub!(scrubber)
  end
  fragment.to_html
end
14
+
15
# Converts plain text to HTML: each line becomes a text node, with a <br>
# emitted before every line except the first.
def self.text_to_html(text)
  lines = text.split("\n")
  built = Simple::Builder.build_html do |html|
    lines.each_with_index do |line, index|
      html.br if index > 0
      html.text(line)
    end
  end
  built.to_html
end
23
+
24
# Scrubber that strips the Vox donation footer.
#
# When a node's text is exactly 'Help keep Vox free for all', the nearest
# preceding sibling <hr> (if any) is removed, then every following sibling,
# then the node itself, and traversal below it is stopped.
RemoveVoxFooter = Loofah::Scrubber.new do |node|
  next unless node.text == 'Help keep Vox free for all'
  rule = node.previous
  rule = rule.previous until rule.nil? || rule.name == 'hr'
  rule.remove if rule
  node.next.remove while node.next
  node.remove
  Loofah::Scrubber::STOP
end
40
+
41
# Scrubber that drops feed clutter: <div class="feedflare"> blocks and 1x1
# <img> elements are removed, and <form> elements are replaced by their
# children.
RemoveExtras = Loofah::Scrubber.new do |node|
  case node.name
  when 'div'
    node.remove if node['class'] == 'feedflare'
  when 'img'
    node.remove if node['height'] == '1' && node['width'] == '1'
  when 'form'
    node.replace(node.children)
  end
end
50
+
51
# Scrubber that removes presentational markup: <font>, <big> and <small>
# elements are replaced by their children; on every other node the style,
# class and id attributes are removed when present.
RemoveStyling = Loofah::Scrubber.new do |node|
  if %w{font big small}.include?(node.name)
    node.replace(node.children)
  else
    %w{style class id}.each do |attribute|
      node.remove_attribute(attribute) if node[attribute]
    end
  end
end
60
+
61
# Scrubber that turns each <blockquote> into <div class="blockquote">.
ReplaceBlockquote = Loofah::Scrubber.new do |node|
  next unless node.name == 'blockquote'
  node.name = 'div'
  node['class'] = 'blockquote'
end
67
+
68
+ end
69
+
70
+ end
@@ -0,0 +1,274 @@
1
+ module NewsFetcher
2
+
3
+ class Subscription
4
+
5
+ attr_accessor :id
6
+ attr_accessor :dir
7
+ attr_accessor :config
8
+ attr_accessor :styles
9
+ attr_accessor :items
10
+ attr_accessor :title
11
+
12
+ include SetParams
13
+ include Simple::Printer::Printable
14
+
15
# Creates a subscription from +params+ and opens its two histories.
#
# @title and @items get their defaults before +super+ runs, so values
# supplied in +params+ take precedence. The item history is indexed by :id;
# the response history is not indexed.
def initialize(**params)
  @title, @items = nil, []
  super(**params)
  @item_history = History.new(file: item_history_file, index_key: :id)
  @response_history = History.new(file: response_history_file)
end
22
+
23
# Uses #to_s as the inspect output.
def inspect
  self.to_s
end
26
+
27
# Field list for Simple::Printer::Printable.
#
# The age is shown in whole days (age divided by DaySecs) or 'never' when the
# subscription has no age; the last response is rendered by
# format_response_history_entry.
def printable
  elapsed = age
  age_text = elapsed ? '%d days' % (elapsed / DaySecs) : 'never'
  last_response = format_response_history_entry(@response_history.last_entry)
  [
    [:id, 'ID'],
    { label: 'URI', value: @config.uri },
    :dir,
    :title,
    :status,
    { label: 'Age', value: age_text },
    { label: 'Disabled', value: @config.disabled },
    { label: 'Last response', value: last_response },
    :items,
  ]
end
40
+
41
# Renders a response-history entry as "<status> (<reason>) at <time>", or
# 'none' when there is no entry.
def format_response_history_entry(entry)
  return 'none' unless entry
  format('%s (%s) at %s', entry.status, entry.reason, entry.time)
end
48
+
49
# Path of the subscription's config file (ConfigFileName inside @dir).
#
# Raises Error when @dir is not set.
def config_file
  return @dir / ConfigFileName if @dir
  raise Error, "dir not set"
end
53
+
54
# Path of the item-history file (ItemHistoryFileName inside @dir).
#
# Raises Error when @dir is not set.
def item_history_file
  return @dir / ItemHistoryFileName if @dir
  raise Error, "dir not set"
end
58
+
59
# Path of the response-history file (ResponseHistoryFileName inside @dir).
#
# Raises Error when @dir is not set.
def response_history_file
  return @dir / ResponseHistoryFileName if @dir
  raise Error, "dir not set"
end
63
+
64
# Time elapsed since the last item-history entry (Time.now minus its time),
# or nil when the item history has no entries.
def age
  latest = @item_history.last_entry
  return nil unless latest
  Time.now - latest.time
end
68
+
69
# Classifies the subscription by its age:
#   :new     - no age (no item history yet)
#   :dormant - age greater than @config.max_age
#   :active  - otherwise
def status
  elapsed = age
  return :new unless elapsed
  elapsed > @config.max_age ? :dormant : :active
end
80
+
81
# True when @config.disabled is truthy, false otherwise.
def disabled?
  @config.disabled ? true : false
end
84
+
85
# Derives the dotted delivery-folder name from the subscription id.
#
# The id is split on '/'; with @config.consolidate set, the last component is
# dropped (only when there is more than one); @config.root_folder, when set,
# is prepended; the components are joined with '.'.
def make_dotted_folder
  parts = @id.split('/')
  parts = parts[0...-1] if @config.consolidate && parts.length > 1
  parts = [@config.root_folder, *parts] if @config.root_folder
  parts.join('.')
end
91
+
92
# Appends one record per current item (its date as :time, plus its :id) to
# the item history.
def update_item_history
  @items.each { |item| @item_history << { time: item.date, id: item.id } }
end
97
+
98
# Prunes item-history entries older than @config.max_age and logs each
# pruned entry at info level.
def prune_item_history
  cutoff = Time.now - @config.max_age
  pruned = @item_history.prune(before: cutoff)
  pruned.each do |entry|
    $logger.info { "pruned #{entry.id.inspect} (#{entry.time})"}
  end
end
103
+
104
# Prunes response-history entries older than @config.max_age and logs each
# pruned entry at info level.
def prune_response_history
  cutoff = Time.now - @config.max_age
  pruned = @response_history.prune(before: cutoff)
  pruned.each do |entry|
    $logger.info { "pruned response from #{entry.time}" }
  end
end
109
+
110
# Runs one update cycle for the subscription.
#
# Both histories are pruned first. If the subscription was updated recently
# the cycle stops there (returning nil); otherwise the feed is fetched,
# unwanted items are rejected, the item history is updated, and the
# remaining items are delivered.
#
# Any Error raised during the cycle is logged at error level rather than
# propagated.
def update
  $logger.debug { "#{@id}: updating" }
  begin
    prune_item_history
    prune_response_history
    if recently_updated?
      $logger.info { "#{@id}: too soon to update" }
      nil
    else
      get
      reject_items
      update_item_history
      deliver
    end
  rescue Error => e
    $logger.error { "#{@id}: #{e}" }
  end
end
127
+
128
# Whether the last recorded response is younger than
# @config.update_interval. Falsy when there is no recorded response.
#
# The response status is not considered: a status filter,
# (200...400).include?(entry.status), is present in the source only as a
# commented-out condition.
def recently_updated?
  latest = @response_history.last_entry
  latest && (Time.now - latest.time) < @config.update_interval
end
133
+
134
# Fetches the feed at @config.uri and records the response in the response
# history (time, status, reason).
#
# On a failed response a warning is logged and @title/@items are left
# untouched. On success the feed is parsed: @title becomes @config.title or
# the feed's title, and @items becomes the feed's items. A warning is also
# logged when the fetcher reports the URI as moved, unless
# @config.ignore_moved is set.
def get
  fetcher = Fetcher.get(@config.uri)
  @response_history << {
    time: Time.now,
    status: fetcher.response_status,
    reason: fetcher.response_reason,
  }
  unless fetcher.success?
    return $logger.warn { "#{@id}: HTTP error #{fetcher.response_status} (#{fetcher.response_reason})" }
  end
  if fetcher.moved && !@config.ignore_moved
    $logger.warn { "#{@id}: URI #{@config.uri} moved to #{fetcher.actual_uri}" }
  end
  feed = fetcher.parse_feed
  @title = @config.title || feed[:title]
  @items = feed[:items]
end
152
+
153
# Removes from @items every item that reject_item? gives a reason for,
# logging each removal (with its reason) at debug level.
def reject_items
  @items.reject! do |item|
    reason = reject_item?(item)
    next false unless reason
    $logger.debug { "#{@id}: removing item: #{reason} #{item.id}" }
    true
  end
end
161
+
162
# Decides whether +item+ should be dropped.
#
# Returns the reason as a String, or nil to keep the item:
#   'outdated item' - the item's age exceeds @config.max_age
#   'seen item'     - the item's id is already in the item history
#   'ignored item'  - the item's URI matches one of @config.ignore_uris
def reject_item?(item)
  return 'outdated item' if item.age > @config.max_age
  return 'seen item' if @item_history[item.id]
  return 'ignored item' if @config.ignore_uris.find { |pattern| item.uri.to_s =~ pattern }
  nil
end
171
+
172
# Delivers the current items in date order, logging a debug message when
# there is nothing to deliver.
def deliver
  $logger.debug { "#{@id}: no items to deliver" } if @items.empty?
  ordered = @items.sort_by { |item| item.date }
  ordered.each { |item| deliver_item(item) }
end
178
+
179
# Resets the item history. The response history is not touched.
def reset
  history = @item_history
  history.reset
end
182
+
183
# Clears the disabled flag in the config and saves it to config_file.
def enable
  settings = @config
  settings.disabled = false
  settings.save(config_file)
end
187
+
188
# Sets the disabled flag in the config and saves it to config_file.
def disable
  settings = @config
  settings.disabled = true
  settings.save(config_file)
end
192
+
193
# Intentionally empty in this version: does nothing and returns nil.
def fix
  nil
end
195
+
196
# Builds an HTML mail for +item+ and delivers it.
#
# The From, To and Subject headers are ERB templates from the config,
# rendered with subscription_id, item_title and subscription_folder. The
# body is the item's HTML document (build_item_html).
#
# When the configured delivery method is :maildir, Mail::Maildir is used and
# the configured :dir gets a ".<folder>" subdirectory appended. When no
# delivery method is configured, the mail's own default is used.
def deliver_item(item)
  folder = make_dotted_folder
  fields = {
    subscription_id: @id,
    item_title: item.title,
    subscription_folder: folder,
  }
  render = ->(template) { ERB.new(template).result_with_hash(fields) }
  mail = Mail.new
  mail.date = item.date
  mail.from = render.call(@config.mail_from)
  mail.to = render.call(@config.mail_to)
  mail.subject = render.call(@config.mail_subject)
  mail.content_type = 'text/html'
  mail.charset = 'utf-8'
  mail.body = build_item_html(item)
  transport = @config.delivery_method&.to_sym
  transport_params = @config.delivery_params
  $logger.info do
    format(
      "#{@id}: Sending item to %s in folder %s via %s: %p",
      mail.to.join(', '),
      folder,
      transport || '<default>',
      mail.subject,
    )
  end
  if transport == :maildir
    transport = Mail::Maildir
    maildir = Path.new(transport_params[:dir]) / ".#{folder}"
    transport_params = transport_params.merge(dir: maildir.to_s)
  end
  mail.delivery_method(transport, **transport_params) if transport
  mail.deliver!
end
229
+
230
# Renders +item+ as a complete HTML4 document string.
#
# Head: two meta tags plus one <style> element per entry in @styles.
# Body: a header div ("<title> [<id>]"), an <h1> with the item title (when
# present), an <h2> with the date and author joined by a bullet, an <h3>
# link to the item URI (when present), and the content -- scrubbed when it
# is HTML, converted from plain text otherwise.
def build_item_html(item)
  document = Simple::Builder.build_html4_document do |html|
    html.html do
      html.head do
        html.meta(name: 'x-apple-disable-message-reformatting')
        html.meta(name: 'viewport', content: 'width=device-width, initial-scale=1')
        @styles.each do |style|
          html.style { html << style }
        end
      end
      html.body do
        html.div(class: 'header') do
          html << format('%s [%s]', @title, @id).to_html
        end
        if item.title
          html.h1 { html << item.title.to_html }
        end
        html.h2 do
          byline = [item.date.strftime('%e %B %Y'), item.author].compact
          html << byline.join(' • ').to_html
        end
        if item.uri
          html.h3 { html.a(item.uri.prettify, href: item.uri) }
        end
        if item.content
          html << (item.content.html? ? Scrubber.scrub_html(item.content) : Scrubber.text_to_html(item.content))
        end
      end
    end
  end
  document.to_html
end
271
+
272
+ end
273
+
274
+ end
@@ -0,0 +1,5 @@
1
module NewsFetcher

  # Release version of the gem. Frozen so the constant cannot be mutated in
  # place.
  VERSION = '0.84'.freeze

end