pluto-update 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d631c3eb45ac43efb9e2d0186270a5ff6adc0e43
4
- data.tar.gz: bb4774dcb0d97bc05b3f19f05956f608d94bf18d
3
+ metadata.gz: 49adc91fcbc3b7df6d0ee5aed048fa0df4abb186
4
+ data.tar.gz: 16b22d7a9993c287f592176acd9991ac5d42e3c5
5
5
  SHA512:
6
- metadata.gz: 858c5e290c621b417ab5773e3c307daed4a0c0174d00ef88729ac9a235deafc5e0c42027eaaf86220017415b9538be04c6661a7f989fedd5c87c5143c97e2fe9
7
- data.tar.gz: 228ffdf0626f7455c73bc108bdc4639c5f2ff42d468238d2d6dcd308a8e0df3aead6f0f2cecf0d374a36b378b3babba0353ad10f26d5e8ca81ef4d102117302b
6
+ metadata.gz: c0db8f9b7dcb72009df75b4cc001fe522e579f4b55bc8292b7ab5d19f4166aead75084b0da9356444dddd00b999a372e9972abaedf9230497706db138d766f80
7
+ data.tar.gz: 0fc2ba83226d96ca4b9a71829079c7232322997963230a99cda0948ae4591dc4071725ddc287a2485dcdb083ec8fc1015c0399a50238ff78901f0dd39d908d00
File without changes
@@ -3,8 +3,12 @@ Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
5
  lib/pluto/update.rb
6
- lib/pluto/update/fetcher.rb
7
- lib/pluto/update/refresher.rb
8
- lib/pluto/update/subscriber.rb
9
- lib/pluto/update/updater.rb
6
+ lib/pluto/update/feed_refresher.rb
7
+ lib/pluto/update/site_fetcher.rb
8
+ lib/pluto/update/site_refresher.rb
9
+ lib/pluto/update/site_updater.rb
10
10
  lib/pluto/update/version.rb
11
+ test/data/ruby.ini
12
+ test/helper.rb
13
+ test/test_refresh.rb
14
+ test/test_site.rb
data/README.md CHANGED
@@ -17,10 +17,10 @@
17
17
  ```
18
18
  title = Planet Ruby
19
19
 
20
- [rubyflow]
21
- title = Ruby Flow
22
- link = http://rubyflow.com
23
- feed = http://feeds.feedburner.com/Rubyflow?format=xml
20
+ [rubylang]
21
+ title = Ruby Lang News
22
+ link = http://www.ruby-lang.org/en/news
23
+ feed = http://www.ruby-lang.org/en/feeds/news.rss
24
24
 
25
25
  [rubyonrails]
26
26
  title = Ruby on Rails Blog
data/Rakefile CHANGED
@@ -18,9 +18,10 @@ Hoe.spec 'pluto-update' do
18
18
  self.history_file = 'HISTORY.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['pluto-models', '>= 1.3.2'],
22
- ['fetcher', '>= 0.4.4'],
23
- ['preproc', '>= 0.1.0'],
21
+ ['pluto-models', '>= 1.3.2'],
22
+ ['pluto-feedfetcher', '>= 0.1.0'],
23
+ ['fetcher', '>= 0.4.4'],
24
+ ['preproc', '>= 0.1.0'],
24
25
  ]
25
26
 
26
27
  self.licenses = ['Public Domain']
@@ -2,6 +2,7 @@
2
2
 
3
3
 
4
4
  require 'pluto/models'
5
+ require 'pluto/feedfetcher'
5
6
 
6
7
 
7
8
  # more 3rd party gems
@@ -11,26 +12,30 @@ require 'preproc' # include preprocessor
11
12
 
12
13
  # our own code
13
14
  require 'pluto/update/version' # Note: let version always go first
14
- require 'pluto/update/fetcher'
15
- require 'pluto/update/refresher'
16
- require 'pluto/update/subscriber'
17
- require 'pluto/update/updater'
15
+ require 'pluto/update/feed_refresher'
16
+ require 'pluto/update/site_refresher'
17
+ require 'pluto/update/site_fetcher'
18
+ require 'pluto/update/site_updater'
18
19
 
19
20
 
20
21
 
21
22
  module Pluto
22
23
 
23
- # todo: add alias update_site( config ) ??
24
- def self.update_subscriptions( config )
25
- Subscriber.new.update_subscriptions( config )
24
+ def self.refresh_feeds ## refresh == fetch+parse+update
25
+ FeedRefresher.new.refresh_feeds
26
26
  end
27
27
 
28
+ def self.refresh_sites ## refresh == fetch+parse+update
29
+ SiteRefresher.new.refresh_sites
30
+ end
31
+
32
+ ### convenience alias w/ update_ -- use refresh (only) - why? why not??
28
33
  def self.update_feeds
29
- Refresher.new.update_feeds
34
+ FeedRefresher.new.refresh_feeds
30
35
  end
31
36
 
32
37
  def self.update_sites
33
- Refresher.new.update_sites
38
+ SiteRefresher.new.refresh_sites
34
39
  end
35
40
 
36
41
  end # module Pluto
@@ -3,41 +3,26 @@
3
3
 
4
4
  module Pluto
5
5
 
6
- class Refresher
6
+ #######
7
+ # note: refresh
8
+ # refresh will fetch feeds, parse feeds and then update feeds
9
+ # (e.g. update is just one operation of refresh)
7
10
 
8
- include LogUtils::Logging
11
+ class FeedRefresher
9
12
 
13
+ include LogUtils::Logging
10
14
  include Models
11
15
 
12
16
  def initialize
13
- @worker = Fetcher.new
17
+ ## @worker = FeedFetcherBasic.new ## -- simple fetch (strategy); no cache, no cond get etc.
18
+ @worker = FeedFetcherCondGetWithCache.new
14
19
  end
15
20
 
16
21
  def debug=(value) @debug = value; end
17
22
  def debug?() @debug || false; end
18
23
 
19
24
 
20
- def update_sites( opts={} ) # update all site configs
21
- if debug?
22
- ## turn on logging for sql too
23
- ActiveRecord::Base.logger = Logger.new( STDOUT )
24
- @worker.debug = true # also pass along worker debug flag if set
25
- end
26
-
27
- start_time = Time.now
28
- Activity.create!( text: "start update sites (#{Site.count})" )
29
-
30
- #### - hack - use order(:id) instead of .all - avoids rails/activerecord 4 warnings
31
- Site.order(:id).each do |site|
32
- update_site_worker( site ) if site.url.present? # note: only update if (source) url present
33
- end
34
-
35
- total_secs = Time.now - start_time
36
- Activity.create!( text: "done update sites (#{Site.count}) in #{total_secs}s" )
37
- end
38
-
39
-
40
- def update_feeds( opts={} ) # update all feeds
25
+ def refresh_feeds( opts={} ) # refresh (fetch+parse+update) all feeds
41
26
  if debug?
42
27
  ## turn on logging for sql too
43
28
  ActiveRecord::Base.logger = Logger.new( STDOUT )
@@ -49,16 +34,16 @@ class Refresher
49
34
 
50
35
  #### - hack - use order(:id) instead of .all - avoids rails/activerecord 4 warnings
51
36
  Feed.order(:id).each do |feed|
52
- update_feed_worker( feed )
37
+ refresh_feed_worker( feed )
53
38
  ### todo/fix: add catch exception in loop and log to activity log and continue w/ next feed
54
39
  end
55
40
 
56
41
  total_secs = Time.now - start_time
57
- Activity.create!( text: "done update feeds (#{Site.count}) in #{total_secs}s" )
42
+ Activity.create!( text: "done update feeds (#{Feed.count}) in #{total_secs}s" )
58
43
  end
59
44
 
60
45
 
61
- def update_feeds_for( site_key, opts={} )
46
+ def refresh_feeds_for( site_key, opts={} ) # refresh (fetch+parse+update) feeds for site
62
47
  if debug?
63
48
  ## turn on logging for sql too
64
49
  ActiveRecord::Base.logger = Logger.new( STDOUT )
@@ -71,35 +56,23 @@ class Refresher
71
56
  site = Site.find_by_key!( site_key )
72
57
 
73
58
  site.feeds.each do |feed|
74
- update_feed_worker( feed )
59
+ refresh_feed_worker( feed )
75
60
  end
76
61
 
77
- end # method update_feeds
62
+ end # method refresh_feeds_for
78
63
 
79
64
 
80
65
  private
81
- def update_site_worker( site_rec )
82
- site_config = @worker.site_by_rec_if_modified( site_rec )
83
-
84
- # on error or if http-not modified etc. skip update/processing
85
- return if site_config.nil?
86
-
87
- subscriber = Subscriber.new
88
- subscriber.debug = debug? ? true : false # pass along debug flag
66
+ def refresh_feed_worker( feed_rec )
67
+ feed_xml = @worker.fetch( feed_rec )
89
68
 
90
- subscriber.update_subscriptions_for( site_rec.key, site_config )
91
- end
92
-
93
-
94
- def update_feed_worker( feed_rec )
95
- feed = @worker.feed_by_rec_if_modified( feed_rec )
96
-
97
69
  # on error or if http-not modified etc. skip update/processing
98
- return if feed.nil?
70
+ return if feed_xml.nil?
71
+
72
+ feed = FeedUtils::Parser.parse( feed_xml )
99
73
 
100
74
  ## fix/todo: reload feed_rec - fetched date updated etc.
101
75
  ## check if needed for access to fetched date
102
-
103
76
 
104
77
  ## todo/check: move feed_rec update to the end (after item updates??)
105
78
 
@@ -107,7 +80,7 @@ private
107
80
  # generator
108
81
  # published_at,built_at,touched_at,fetched_at
109
82
  # summary,title2
110
-
83
+
111
84
  ## fix:
112
85
  ## weird rss exception error on windows w/ dates
113
86
  # e.g. /lib/ruby/1.9.1/rss/rss.rb:37:in `w3cdtf': wrong number of arguments (1 for 0) (ArgumentError)
@@ -117,9 +90,10 @@ private
117
90
 
118
91
 
119
92
  feed_rec.debug = debug? ? true : false # pass along debug flag
120
- ## fix/todo: pass debug flag as opts - debug: true|false !!!!!!
121
- feed_rec.save_from_struct!( feed ) # todo: find a better name - why? why not??
122
93
 
94
+ ## fix/todo: pass debug flag as opts - debug: true|false !!!!!!
95
+ # fix/todo: find a better name - why? why not?? => use update_from_struct!
96
+ feed_rec.save_from_struct!( feed )
123
97
 
124
98
  # update cached value last published for item
125
99
  last_item_rec = feed_rec.items.latest.limit(1).first # note limit(1) will return relation/arrar - use first to get first element or nil from ary
@@ -130,8 +104,8 @@ private
130
104
  feed_rec.update_attributes!( last_published: last_item_rec.touched )
131
105
  end
132
106
  end
133
- end # method update_feed_worker
107
+ end # method refresh_feed_worker
134
108
 
135
- end # class Refresher
109
+ end # class FeedRefresher
136
110
 
137
111
  end # module Pluto
@@ -0,0 +1,134 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Pluto
5
+
6
+ class SiteFetcher
7
+
8
+ include LogUtils::Logging
9
+ include Models # for easy convenience access for Activity etc.
10
+
11
+ def initialize
12
+ @worker = Fetcher::Worker.new
13
+ end
14
+
15
+ def debug=(value) @debug = value; end
16
+ def debug?() @debug || false; end
17
+
18
+ def fetch( site_rec )
19
+ ####################################################
20
+ # try smart http update; will update db records
21
+
22
+ site_url = site_rec.url
23
+ site_key = site_rec.key
24
+
25
+ ### todo/fix: normalize/unify feed_url
26
+ ## - same in fetcher - use shared utility method or similar
27
+
28
+ @worker.use_cache = true
29
+ @worker.cache[ site_url ] = {
30
+ 'etag' => site_rec.http_etag,
31
+ 'last-modified' => site_rec.http_last_modified
32
+ }
33
+
34
+ response = @worker.get( site_url )
35
+ @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
36
+
37
+ site_fetched = Time.now
38
+
39
+ ###
40
+ # Note: Net::HTTP will NOT set encoding UTF-8 etc.
41
+ # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
42
+ # thus, set/force encoding to utf-8
43
+ site_text = response.body.to_s
44
+ site_text = site_text.force_encoding( Encoding::UTF_8 )
45
+
46
+ if response.code == '304' # not modified (conditional GET - e.g. using etag/last-modified)
47
+
48
+ if site_text.index('@include')
49
+ ## note: if the site_text includes @include
50
+ ## we must revalidate complete file hierarchy(tree) for now
51
+ ### continue;
52
+ ##
53
+ ## fix/todo: use ahead-of-time preprocessor ?? in the future to simplify???
54
+ else
55
+ puts "OK - fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
56
+ puts "no change; request returns not modified (304); skipping parsing site config"
57
+ return nil # no updates available; nothing to do
58
+ end
59
+
60
+ elsif response.code != '200' # note Net::HTTP response.code is a string in ruby
61
+
62
+ puts "*** error: fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
63
+
64
+ site_attribs = {
65
+ http_code: response.code.to_i,
66
+ http_server: response.header[ 'server' ],
67
+ http_etag: nil,
68
+ http_last_modified: nil,
69
+ fetched: site_fetched
70
+ }
71
+ site_rec.update_attributes!( site_attribs )
72
+
73
+ ## add log error activity -- in future add to error log - better - why? why not?
74
+ Activity.create!( text: "*** error: fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}" )
75
+
76
+ return nil # sorry; no feed for parsing available
77
+ else
78
+ # assume 200; continue w/ processing
79
+ end
80
+
81
+ puts "OK - fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
82
+
83
+ site_attribs = {
84
+ http_code: response.code.to_i,
85
+ http_server: response.header[ 'server' ],
86
+ http_etag: response.header[ 'etag' ],
87
+ http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
88
+ fetched: site_fetched
89
+ }
90
+
91
+ ## if debug?
92
+ puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
93
+ puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
94
+ puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
95
+ ## end
96
+
97
+ site_rec.update_attributes!( site_attribs )
98
+
99
+
100
+ #################
101
+ ### fix: add support for http_etag cache etc. - how??
102
+ ###
103
+ ### use from_text( text, base: base ) !!!!!!!!
104
+ ### do NOT reissue first request
105
+ ##
106
+ ## fix: use special case/method for update_with_includes!!!
107
+ ## keep it simple w/o includes (do NOT mix in one method)
108
+ ## split into two methods!!!
109
+
110
+ ## retry w/ preprocessor
111
+ ## refetch if @include found w/ all includes included
112
+ if site_text.index('@include')
113
+ site_text = InclPreproc.from_url( site_url ).read
114
+ end
115
+
116
+ ## logger.debug "site_text:"
117
+ ## logger.debug site_text[ 0..300 ] # get first 300 chars
118
+
119
+ site_text
120
+
121
+ ###
122
+ ## todo/fix:
123
+ ### move INI.load out of this method!! - return site_text or nil
124
+ ##
125
+ ## puts "Before parsing site config >#{site_key}<..."
126
+ ##
127
+ # assume ini format for now
128
+ ## site_config = INI.load( site_text )
129
+ ## site_config
130
+ end
131
+
132
+ end # class SiteFetcher
133
+
134
+ end # module Pluto
@@ -0,0 +1,67 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Pluto
5
+
6
+ #######
7
+ # note: refresh
8
+ # refresh will fetch site subscriptions, parse and then update the site subscriptions
9
+ # (e.g. update is just one operation of refresh)
10
+
11
+
12
+ class SiteRefresher
13
+
14
+ include LogUtils::Logging
15
+ include Models
16
+
17
+ def initialize
18
+ @worker = SiteFetcher.new
19
+ end
20
+
21
+ def debug=(value) @debug = value; end
22
+ def debug?() @debug || false; end
23
+
24
+ def refresh_sites( opts={} ) # refresh (fetch+parse+update) all site configs
25
+ if debug?
26
+ ## turn on logging for sql too
27
+ ActiveRecord::Base.logger = Logger.new( STDOUT )
28
+ @worker.debug = true # also pass along worker debug flag if set
29
+ end
30
+
31
+ start_time = Time.now
32
+ Activity.create!( text: "start update sites (#{Site.count})" )
33
+
34
+ #### - hack - use order(:id) instead of .all - avoids rails/activerecord 4 warnings
35
+ Site.order(:id).each do |site|
36
+ refresh_site_worker( site ) if site.url.present? # note: only update if (source) url present
37
+ end
38
+
39
+ total_secs = Time.now - start_time
40
+ Activity.create!( text: "done update sites (#{Site.count}) in #{total_secs}s" )
41
+ end
42
+
43
+
44
+ private
45
+ def refresh_site_worker( site_rec )
46
+ site_text = @worker.fetch( site_rec )
47
+
48
+ # on error or if http-not modified etc. skip update/processing
49
+ return if site_text.nil?
50
+
51
+ site_config = INI.load( site_text )
52
+
53
+ site_updater = SiteUpdater.new
54
+ site_updater.debug = debug? ? true : false # pass along debug flag
55
+
56
+ ### todo/fix:
57
+ ## allow passing in as first arg database rec!!!
58
+ ## - if passed in database rec - do NOT lookup record by site_config.key!!!
59
+ ## use existing key (lets you change/update key without creating a new duplicate site entry, for example)
60
+ ## - or use a new method (instead of overloading arg) ?? - why? why not??
61
+ site_updater.update_subscriptions_for( site_rec.key, site_config )
62
+ end
63
+
64
+ end # class SiteRefresher
65
+
66
+ end # module Pluto
67
+
@@ -3,10 +3,9 @@
3
3
 
4
4
  module Pluto
5
5
 
6
- class Subscriber
6
+ class SiteUpdater
7
7
 
8
8
  include LogUtils::Logging
9
-
10
9
  include Models
11
10
 
12
11
  def debug=(value) @debug = value; end
@@ -156,8 +155,8 @@ class Subscriber
156
155
  site_rec.subscriptions.create!( feed_id: feed_rec.id )
157
156
  end
158
157
 
159
- end # method update_subscriptions
158
+ end # method update_subscriptions_for
160
159
 
161
- end # class Subscriber
160
+ end # class SiteUpdater
162
161
 
163
162
  end # module Pluto
@@ -4,7 +4,7 @@
4
4
  module PlutoUpdate
5
5
 
6
6
  MAJOR = 1
7
- MINOR = 4
7
+ MINOR = 5
8
8
  PATCH = 0
9
9
  VERSION = [MAJOR,MINOR,PATCH].join('.')
10
10
 
@@ -0,0 +1,18 @@
1
+ title = Planet Ruby
2
+ source = https://github.com/feedreader/pluto.update/raw/master/test/data/ruby.ini
3
+
4
+ [rubylang]
5
+ title = Ruby Lang News
6
+ link = http://www.ruby-lang.org/en/news
7
+ feed = http://www.ruby-lang.org/en/feeds/news.rss
8
+
9
+ [rubyonrails]
10
+ title = Ruby on Rails News
11
+ link = http://weblog.rubyonrails.org
12
+ feed = http://weblog.rubyonrails.org/feed/atom.xml
13
+
14
+ [viennarb]
15
+ title = Vienna.rb News
16
+ link = http://vienna-rb.at
17
+ feed = http://vienna-rb.at/atom.xml
18
+
@@ -0,0 +1,25 @@
1
+ ## $:.unshift(File.dirname(__FILE__))
2
+
3
+
4
+ ## minitest setup
5
+
6
+ require 'minitest/autorun'
7
+
8
+ ## our own code
9
+ require 'pluto/update'
10
+
11
+
12
+ LogUtils::Logger.root.level = :debug
13
+
14
+
15
+ ## some shortcuts
16
+ Log = LogDb::Model::Log
17
+ Prop = ConfDb::Model::Prop
18
+
19
+ Site = Pluto::Model::Site
20
+ Feed = Pluto::Model::Feed
21
+ Item = Pluto::Model::Item
22
+ Subscription = Pluto::Model::Subscription
23
+
24
+
25
+ Pluto.setup_in_memory_db
@@ -0,0 +1,38 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_refresh.rb
6
+ # or better
7
+ # rake test
8
+
9
+ require 'helper'
10
+
11
+ class TestRefresh < MiniTest::Test
12
+
13
+ def setup
14
+ Site.delete_all
15
+ Feed.delete_all
16
+ Item.delete_all
17
+ Subscription.delete_all
18
+
19
+ site_text = File.read( "#{PlutoUpdate.root}/test/data/ruby.ini")
20
+ site_config = INI.load( site_text )
21
+
22
+ site_updater = Pluto::SiteUpdater.new
23
+ site_updater.update_subscriptions_for( 'ruby', site_config )
24
+ end
25
+
26
+
27
+ def test_refresh_sites
28
+ Pluto.refresh_sites
29
+ assert true
30
+ end
31
+
32
+ def test_refresh_feeds
33
+ Pluto.refresh_feeds
34
+ assert true
35
+ end
36
+
37
+ end # class TestRefresh
38
+
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_site.rb
6
+ # or better
7
+ # rake test
8
+
9
+ require 'helper'
10
+
11
+ class TestSite < MiniTest::Test
12
+
13
+ def setup
14
+ Site.delete_all
15
+ Feed.delete_all
16
+ Item.delete_all
17
+ Subscription.delete_all
18
+ end
19
+
20
+
21
+ def test_site_updater
22
+ site_text = File.read( "#{PlutoUpdate.root}/test/data/ruby.ini")
23
+ site_config = INI.load( site_text )
24
+ pp site_config
25
+
26
+ assert_equal 0, Site.count
27
+ assert_equal 0, Feed.count
28
+
29
+ site_updater = Pluto::SiteUpdater.new
30
+ site_updater.update_subscriptions_for( 'ruby', site_config )
31
+
32
+ assert_equal 1, Site.count
33
+ assert_equal 3, Feed.count
34
+
35
+ ruby = Site.find_by_key!( 'ruby' )
36
+ assert_equal 'Planet Ruby', ruby.title
37
+ assert_equal 3, ruby.subscriptions.count
38
+ assert_equal 3, ruby.feeds.count
39
+
40
+ rubylang = Feed.find_by_key!( 'rubylang' )
41
+ assert_equal 'Ruby Lang News', rubylang.title
42
+ assert_equal 'http://www.ruby-lang.org/en/news', rubylang.url
43
+ assert_equal 'http://www.ruby-lang.org/en/feeds/news.rss', rubylang.feed_url
44
+ end
45
+
46
+ end # class TestSite
47
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pluto-update
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-09 00:00:00.000000000 Z
11
+ date: 2015-01-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pluto-models
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.3.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: pluto-feedfetcher
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.1.0
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: fetcher
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -89,16 +103,21 @@ extra_rdoc_files:
89
103
  - Manifest.txt
90
104
  - README.md
91
105
  files:
106
+ - ".gemtest"
92
107
  - HISTORY.md
93
108
  - Manifest.txt
94
109
  - README.md
95
110
  - Rakefile
96
111
  - lib/pluto/update.rb
97
- - lib/pluto/update/fetcher.rb
98
- - lib/pluto/update/refresher.rb
99
- - lib/pluto/update/subscriber.rb
100
- - lib/pluto/update/updater.rb
112
+ - lib/pluto/update/feed_refresher.rb
113
+ - lib/pluto/update/site_fetcher.rb
114
+ - lib/pluto/update/site_refresher.rb
115
+ - lib/pluto/update/site_updater.rb
101
116
  - lib/pluto/update/version.rb
117
+ - test/data/ruby.ini
118
+ - test/helper.rb
119
+ - test/test_refresh.rb
120
+ - test/test_site.rb
102
121
  homepage: https://github.com/feedreader/pluto.update
103
122
  licenses:
104
123
  - Public Domain
@@ -125,4 +144,6 @@ rubygems_version: 2.4.2
125
144
  signing_key:
126
145
  specification_version: 4
127
146
  summary: pluto-update - planet feed 'n' subscription updater
128
- test_files: []
147
+ test_files:
148
+ - test/test_site.rb
149
+ - test/test_refresh.rb
@@ -1,357 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module Pluto
5
-
6
- class Fetcher
7
-
8
- include LogUtils::Logging
9
-
10
- include Models # for easy convenience access for Activity etc.
11
-
12
- def initialize
13
- @worker = ::Fetcher::Worker.new
14
- end
15
-
16
- def debug=(value) @debug = value; end
17
- def debug?() @debug || false; end
18
-
19
-
20
- def fetch_feed( url )
21
- response = @worker.get( url )
22
-
23
- ## if debug?
24
- puts "http status #{response.code} #{response.message}"
25
-
26
- puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
27
- puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
28
- puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
29
- ## end
30
-
31
- xml = response.body
32
-
33
- ###
34
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
35
- # will mostly be ASCII
36
- # - try to change encoding to UTF-8 ourselves
37
- logger.debug "xml.encoding.name (before): #{xml.encoding.name}"
38
-
39
- #####
40
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
41
-
42
- ## NB:
43
- # for now "hardcoded" to utf8 - what else can we do?
44
- # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
45
- xml = xml.force_encoding( Encoding::UTF_8 )
46
- logger.debug "xml.encoding.name (after): #{xml.encoding.name}"
47
-
48
- xml
49
- end
50
-
51
-
52
- def feed_by_rec( feed_rec )
53
- # simple feed fetcher; use for debugging (only/mostly)
54
- # -- will NOT change db records in any way
55
-
56
- feed_url = feed_rec.feed_url
57
- feed_key = feed_rec.key
58
-
59
- feed_xml = fetch_feed( feed_url )
60
-
61
- logger.debug "feed_xml:"
62
- logger.debug feed_xml[ 0..500 ] # get first 500 chars
63
-
64
- # if opts.verbose? # also write a copy to disk
65
- if debug?
66
- logger.debug "saving feed to >./#{feed_key}.xml<..."
67
- File.open( "./#{feed_key}.xml", 'w' ) do |f|
68
- f.write( feed_xml )
69
- end
70
- end
71
-
72
- puts "Before parsing feed >#{feed_key}<..."
73
-
74
- ## fix/todo: check for feed.nil? -> error parsing!!!
75
- # or throw exception
76
- feed = FeedUtils::Parser.parse( feed_xml )
77
- feed
78
- end
79
-
80
-
81
- def feed_by_rec_if_modified( feed_rec ) # try smart http update; will update db records
82
- feed_url = feed_rec.feed_url
83
- feed_key = feed_rec.key
84
-
85
- ### todo/fix: normalize/unifiy feed_url
86
- ## - same in fetcher - use shared utitlity method or similar
87
-
88
- @worker.use_cache = true
89
- @worker.cache[ feed_url ] = {
90
- 'etag' => feed_rec.http_etag,
91
- 'last-modified' => feed_rec.http_last_modified
92
- }
93
-
94
- begin
95
- response = @worker.get( feed_url )
96
- rescue SocketError => e
97
- ## catch socket error for unknown domain names (e.g. pragdave.blogs.pragprog.com)
98
- ### will result in SocketError -- getaddrinfo: Name or service not known
99
- puts "*** error: fetching feed '#{feed_key}' - #{e.to_s}"
100
- Activity.create!( text: "*** error: fetching feed '#{feed_key}' - #{e.to_s}" )
101
-
102
- ### todo/fix: update feed rec in db
103
- @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
104
- return nil
105
- end
106
-
107
- @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
108
-
109
- if response.code == '304' # not modified (conditional GET - e.g. using etag/last-modified)
110
- puts "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
111
- puts "no change; request returns not modified (304); skipping parsing feed"
112
- return nil # no updates available; nothing to do
113
- end
114
-
115
- feed_fetched = Time.now
116
-
117
- if response.code != '200' # note Net::HTTP response.code is a string in ruby
118
-
119
- puts "*** error: fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
120
-
121
- feed_attribs = {
122
- http_code: response.code.to_i,
123
- http_server: response.header[ 'server' ],
124
- http_etag: nil,
125
- http_last_modified: nil,
126
- body: nil,
127
- md5: nil,
128
- fetched: feed_fetched
129
- }
130
- feed_rec.update_attributes!( feed_attribs )
131
-
132
- ## add log error activity -- in future add to error log - better - why? why not?
133
- Activity.create!( text: "*** error: fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}" )
134
-
135
- return nil # sorry; no feed for parsing available
136
- end
137
-
138
- puts "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
139
-
140
- feed_xml = response.body
141
- ###
142
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
143
- # will mostly be ASCII
144
- # - try to change encoding to UTF-8 ourselves
145
- logger.debug "feed_xml.encoding.name (before): #{feed_xml.encoding.name}"
146
-
147
-
148
- #####
149
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
150
-
151
- # try Converting ASCII-8BIT to UTF-8 based domain-specific guesses
152
- begin
153
- # Try it as UTF-8 directly
154
- # Note: make a copy/dup - otherwise convert fails (because string is already changed/corrupted)
155
- feed_xml_cleaned = feed_xml.dup.force_encoding( Encoding::UTF_8 )
156
- unless feed_xml_cleaned.valid_encoding?
157
-
158
- puts "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1"
159
- Activity.create!( text: "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1" )
160
- # Some of it might be old Windows code page
161
- # -- (Windows Code Page CP1252 is ISO_8859_1 is Latin-1 - check ??)
162
-
163
- # tell ruby the encoding
164
- # encode to utf-8
165
- ## use all in code encode ?? e.g. feed_xml_cleaned = feed_xml.encode( Encoding::UTF_8, Encoding::ISO_8859_1 )
166
- feed_xml_cleaned = feed_xml.dup.force_encoding( Encoding::ISO_8859_1 ).encode( Encoding::UTF_8 )
167
- end
168
- feed_xml = feed_xml_cleaned
169
- rescue EncodingError => e
170
- puts "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e.to_s}"
171
- Activity.create!( text: "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e.to_s}" )
172
-
173
- # Force it to UTF-8, throwing out invalid bits
174
- ## todo: check options - add ?? or something to mark invalid chars ???
175
- feed_xml.encode!( Encoding::UTF_8, :invalid => :replace, :undef => :replace )
176
- end
177
-
178
- ## NB:
179
- # for now "hardcoded" to utf8 - what else can we do?
180
- # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
181
- ### old "simple" version
182
- ## feed_xml = feed_xml.force_encoding( Encoding::UTF_8 )
183
-
184
-
185
- logger.debug "feed_xml.encoding.name (after): #{feed_xml.encoding.name}"
186
-
187
- ## check for md5 hash for response.body
188
-
189
- last_feed_md5 = feed_rec.md5
190
- feed_md5 = Digest::MD5.hexdigest( feed_xml )
191
-
192
- if last_feed_md5 && last_feed_md5 == feed_md5
193
- # not all servers handle conditional gets, so while not much can be
194
- # done about the bandwidth, but if the response body is identical
195
- # the downstream processing (parsing, caching, ...) can be avoided.
196
- # - thanks to planet mars -fido.rb for the idea, cheers.
197
-
198
- puts "no change; md5 digests match; skipping parsing feed"
199
- return nil # no updates available; nothing to do
200
- end
201
-
202
- feed_attribs = {
203
- http_code: response.code.to_i,
204
- http_server: response.header[ 'server' ],
205
- http_etag: response.header[ 'etag' ],
206
- http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
207
- body: feed_xml,
208
- md5: feed_md5,
209
- fetched: feed_fetched
210
- }
211
-
212
- ## if debug?
213
- puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
214
- puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
215
- puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
216
- ## end
217
-
218
- ### note: might crash w/ encoding errors when saving in postgress
219
- ## e.g. PG::CharacterNotInRepertoire: ERROR: ...
220
- ## catch error, log it and stop for now
221
- #
222
- # in the future check for different charset than utf-8 ?? possible?? how to deal with non-utf8 charsets??
223
-
224
- begin
225
- feed_rec.update_attributes!( feed_attribs )
226
- rescue Exception => e
227
- # log db error; and continue
228
- puts "*** error: updating feed database record '#{feed_key}' - #{e.to_s}"
229
- Activity.create!( text: "*** error: updating feed database record '#{feed_key}' - #{e.to_s}" )
230
- return nil # sorry; corrupt feed; parsing not possible; fix char encoding - make it an option in config??
231
- end
232
-
233
-
234
- logger.debug "feed_xml:"
235
- logger.debug feed_xml[ 0..300 ] # get first 300 chars
236
-
237
- puts "Before parsing feed >#{feed_key}<..."
238
-
239
- ### move to feedutils
240
- ### logger.debug "using stdlib RSS::VERSION #{RSS::VERSION}"
241
-
242
- ## fix/todo: check for feed.nil? -> error parsing!!!
243
- # or throw exception
244
- feed = FeedUtils::Parser.parse( feed_xml )
245
- feed
246
- end
247
-
248
-
249
- def site_by_rec_if_modified( site_rec ) # try smart http update; will update db records
250
- site_url = site_rec.url
251
- site_key = site_rec.key
252
-
253
- ### todo/fix: normalize/unifiy feed_url
254
- ## - same in fetcher - use shared utitlity method or similar
255
-
256
- @worker.use_cache = true
257
- @worker.cache[ site_url ] = {
258
- 'etag' => site_rec.http_etag,
259
- 'last-modified' => site_rec.http_last_modified
260
- }
261
-
262
- response = @worker.get( site_url )
263
- @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
264
-
265
- site_fetched = Time.now
266
-
267
- ###
268
- # Note: Net::HTTP will NOT set encoding UTF-8 etc.
269
- # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
270
- # thus, set/force encoding to utf-8
271
- site_text = response.body.to_s
272
- site_text = site_text.force_encoding( Encoding::UTF_8 )
273
-
274
- if response.code == '304' # not modified (conditional GET - e.g. using etag/last-modified)
275
-
276
- if site_text.index('@include')
277
- ## note: if the site_text includes @include
278
- ## we must revalidate complete file hierachy(tree) for now
279
- ### continue;
280
- ##
281
- ## fix/todo: use ahead-of-time preprocessor ?? in the future to simplify???
282
- else
283
- puts "OK - fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
284
- puts "no change; request returns not modified (304); skipping parsing site config"
285
- return nil # no updates available; nothing to do
286
- end
287
-
288
- elsif response.code != '200' # note Net::HTTP response.code is a string in ruby
289
-
290
- puts "*** error: fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
291
-
292
- site_attribs = {
293
- http_code: response.code.to_i,
294
- http_server: response.header[ 'server' ],
295
- http_etag: nil,
296
- http_last_modified: nil,
297
- fetched: site_fetched
298
- }
299
- site_rec.update_attributes!( site_attribs )
300
-
301
- ## add log error activity -- in future add to error log - better - why? why not?
302
- Activity.create!( text: "*** error: fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}" )
303
-
304
- return nil # sorry; no feed for parsing available
305
- else
306
- # assume 200; continue w/ processing
307
- end
308
-
309
- puts "OK - fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
310
-
311
- site_attribs = {
312
- http_code: response.code.to_i,
313
- http_server: response.header[ 'server' ],
314
- http_etag: response.header[ 'etag' ],
315
- http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
316
- fetched: site_fetched
317
- }
318
-
319
- ## if debug?
320
- puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
321
- puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
322
- puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
323
- ## end
324
-
325
- site_rec.update_attributes!( site_attribs )
326
-
327
-
328
- #################
329
- ### fix: add support for http_etag cache etc. - how??
330
- ###
331
- ### use from_text( text, base: base ) !!!!!!!!
332
- ### do NOT reissue first request
333
- ##
334
- ## fix: use special case/method for update_with_includes!!!
335
- ## keep it simple w/o includes (do NOT mix in one method)
336
- ## split into two methods!!!
337
-
338
- ## retry w/ preprocesser
339
- ## refetch if @include found w/ all includes included
340
- if site_text.index('@include')
341
- site_text = InclPreproc.from_url( site_url ).read
342
- end
343
-
344
- ## logger.debug "site_text:"
345
- ## logger.debug site_text[ 0..300 ] # get first 300 chars
346
-
347
-
348
- puts "Before parsing site config >#{site_key}<..."
349
-
350
- # assume ini format for now
351
- site_config = INI.load( site_text )
352
- site_config
353
- end
354
-
355
- end # class Fetcher
356
-
357
- end # module Pluto
@@ -1,62 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module Pluto
5
-
6
- class Updater
7
-
8
- include LogUtils::Logging
9
-
10
- ### fix!!!!!: change config to text - yes/no - why? why not??
11
- # or pass along struct
12
- # - with hash and text and format(e.g. ini) as fields???
13
- #
14
- # - why? - we need to get handle on md5 digest/hash plus on plain text, ideally to store in db
15
- ## - pass along unparsed text!! - not hash struct
16
- # - will get saved in db plus we need to generate md5 hash
17
- # - add filename e.g. ruby.ini|ruby.conf as opt ??
18
- # or add config format as opt e.g. ini?
19
-
20
- def initialize( opts, config )
21
- @opts = opts
22
- @config = config
23
- end
24
-
25
- attr_reader :opts, :config
26
-
27
- def run( arg )
28
- arg = arg.downcase.gsub('.ini','') # remove file extension if present
29
-
30
- update_for( arg )
31
- end
32
-
33
- def update_for( name )
34
-
35
- ## note: allow (optional) config of site key too
36
- site_key = config['key'] || config['slug']
37
- if site_key.nil?
38
- ## if no key configured; use (file)name; remove -_ chars
39
- ## e.g. jekyll-meta becomes jekyllmeta etc.
40
- site_key = name.downcase.gsub( /[\-_]/, '' )
41
- end
42
-
43
- ###################
44
- # step 1) update subscriptions
45
- subscriber = Subscriber.new
46
-
47
- # pass along debug/verbose setting/switch
48
- subscriber.debug = true if opts.verbose?
49
- subscriber.update_subscriptions_for( site_key, config )
50
-
51
- ##############################
52
- # step 2) update feeds
53
- refresher = Refresher.new
54
-
55
- # pass along debug/verbose setting/switch
56
- refresher.debug = true if opts.verbose?
57
- refresher.update_feeds_for( site_key )
58
- end # method run
59
-
60
- end # class Updater
61
-
62
- end # module Pluto