pluto-update 1.4.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d631c3eb45ac43efb9e2d0186270a5ff6adc0e43
4
- data.tar.gz: bb4774dcb0d97bc05b3f19f05956f608d94bf18d
3
+ metadata.gz: 49adc91fcbc3b7df6d0ee5aed048fa0df4abb186
4
+ data.tar.gz: 16b22d7a9993c287f592176acd9991ac5d42e3c5
5
5
  SHA512:
6
- metadata.gz: 858c5e290c621b417ab5773e3c307daed4a0c0174d00ef88729ac9a235deafc5e0c42027eaaf86220017415b9538be04c6661a7f989fedd5c87c5143c97e2fe9
7
- data.tar.gz: 228ffdf0626f7455c73bc108bdc4639c5f2ff42d468238d2d6dcd308a8e0df3aead6f0f2cecf0d374a36b378b3babba0353ad10f26d5e8ca81ef4d102117302b
6
+ metadata.gz: c0db8f9b7dcb72009df75b4cc001fe522e579f4b55bc8292b7ab5d19f4166aead75084b0da9356444dddd00b999a372e9972abaedf9230497706db138d766f80
7
+ data.tar.gz: 0fc2ba83226d96ca4b9a71829079c7232322997963230a99cda0948ae4591dc4071725ddc287a2485dcdb083ec8fc1015c0399a50238ff78901f0dd39d908d00
File without changes
@@ -3,8 +3,12 @@ Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
5
  lib/pluto/update.rb
6
- lib/pluto/update/fetcher.rb
7
- lib/pluto/update/refresher.rb
8
- lib/pluto/update/subscriber.rb
9
- lib/pluto/update/updater.rb
6
+ lib/pluto/update/feed_refresher.rb
7
+ lib/pluto/update/site_fetcher.rb
8
+ lib/pluto/update/site_refresher.rb
9
+ lib/pluto/update/site_updater.rb
10
10
  lib/pluto/update/version.rb
11
+ test/data/ruby.ini
12
+ test/helper.rb
13
+ test/test_refresh.rb
14
+ test/test_site.rb
data/README.md CHANGED
@@ -17,10 +17,10 @@
17
17
  ```
18
18
  title = Planet Ruby
19
19
 
20
- [rubyflow]
21
- title = Ruby Flow
22
- link = http://rubyflow.com
23
- feed = http://feeds.feedburner.com/Rubyflow?format=xml
20
+ [rubylang]
21
+ title = Ruby Lang News
22
+ link = http://www.ruby-lang.org/en/news
23
+ feed = http://www.ruby-lang.org/en/feeds/news.rss
24
24
 
25
25
  [rubyonrails]
26
26
  title = Ruby on Rails Blog
data/Rakefile CHANGED
@@ -18,9 +18,10 @@ Hoe.spec 'pluto-update' do
18
18
  self.history_file = 'HISTORY.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['pluto-models', '>= 1.3.2'],
22
- ['fetcher', '>= 0.4.4'],
23
- ['preproc', '>= 0.1.0'],
21
+ ['pluto-models', '>= 1.3.2'],
22
+ ['pluto-feedfetcher', '>= 0.1.0'],
23
+ ['fetcher', '>= 0.4.4'],
24
+ ['preproc', '>= 0.1.0'],
24
25
  ]
25
26
 
26
27
  self.licenses = ['Public Domain']
@@ -2,6 +2,7 @@
2
2
 
3
3
 
4
4
  require 'pluto/models'
5
+ require 'pluto/feedfetcher'
5
6
 
6
7
 
7
8
  # more 3rd party gems
@@ -11,26 +12,30 @@ require 'preproc' # include preprocessor
11
12
 
12
13
  # our own code
13
14
  require 'pluto/update/version' # Note: let version always go first
14
- require 'pluto/update/fetcher'
15
- require 'pluto/update/refresher'
16
- require 'pluto/update/subscriber'
17
- require 'pluto/update/updater'
15
+ require 'pluto/update/feed_refresher'
16
+ require 'pluto/update/site_refresher'
17
+ require 'pluto/update/site_fetcher'
18
+ require 'pluto/update/site_updater'
18
19
 
19
20
 
20
21
 
21
22
  module Pluto
22
23
 
23
- # todo: add alias update_site( config ) ??
24
- def self.update_subscriptions( config )
25
- Subscriber.new.update_subscriptions( config )
24
+ def self.refresh_feeds ## refresh == fetch+parse+update
25
+ FeedRefresher.new.refresh_feeds
26
26
  end
27
27
 
28
+ def self.refresh_sites ## refresh == fetch+parse+update
29
+ SiteRefresher.new.refresh_sites
30
+ end
31
+
32
+ ### convenience alias w/ update_ -- use refresh (only) - why? why not??
28
33
  def self.update_feeds
29
- Refresher.new.update_feeds
34
+ FeedRefresher.new.refresh_feeds
30
35
  end
31
36
 
32
37
  def self.update_sites
33
- Refresher.new.update_sites
38
+ SiteRefresher.new.refresh_sites
34
39
  end
35
40
 
36
41
  end # module Pluto
@@ -3,41 +3,26 @@
3
3
 
4
4
  module Pluto
5
5
 
6
- class Refresher
6
+ #######
7
+ # note: refresh
8
+ # refresh will fetch feeds, parse feeds and then update feeds
9
+ # (e.g. update is just one operation of refresh)
7
10
 
8
- include LogUtils::Logging
11
+ class FeedRefresher
9
12
 
13
+ include LogUtils::Logging
10
14
  include Models
11
15
 
12
16
  def initialize
13
- @worker = Fetcher.new
17
+ ## @worker = FeedFetcherBasic.new ## -- simple fetch (strategy); no cache, no cond get etc.
18
+ @worker = FeedFetcherCondGetWithCache.new
14
19
  end
15
20
 
16
21
  def debug=(value) @debug = value; end
17
22
  def debug?() @debug || false; end
18
23
 
19
24
 
20
- def update_sites( opts={} ) # update all site configs
21
- if debug?
22
- ## turn on logging for sql too
23
- ActiveRecord::Base.logger = Logger.new( STDOUT )
24
- @worker.debug = true # also pass along worker debug flag if set
25
- end
26
-
27
- start_time = Time.now
28
- Activity.create!( text: "start update sites (#{Site.count})" )
29
-
30
- #### - hack - use order(:id) instead of .all - avoids rails/activerecord 4 warnings
31
- Site.order(:id).each do |site|
32
- update_site_worker( site ) if site.url.present? # note: only update if (source) url present
33
- end
34
-
35
- total_secs = Time.now - start_time
36
- Activity.create!( text: "done update sites (#{Site.count}) in #{total_secs}s" )
37
- end
38
-
39
-
40
- def update_feeds( opts={} ) # update all feeds
25
+ def refresh_feeds( opts={} ) # refresh (fetch+parse+update) all feeds
41
26
  if debug?
42
27
  ## turn on logging for sql too
43
28
  ActiveRecord::Base.logger = Logger.new( STDOUT )
@@ -49,16 +34,16 @@ class Refresher
49
34
 
50
35
  #### - hack - use order(:id) instead of .all - avoids rails/activerecord 4 warnings
51
36
  Feed.order(:id).each do |feed|
52
- update_feed_worker( feed )
37
+ refresh_feed_worker( feed )
53
38
  ### todo/fix: add catch exception in loop and log to activity log and continue w/ next feed
54
39
  end
55
40
 
56
41
  total_secs = Time.now - start_time
57
- Activity.create!( text: "done update feeds (#{Site.count}) in #{total_secs}s" )
42
+ Activity.create!( text: "done update feeds (#{Feed.count}) in #{total_secs}s" )
58
43
  end
59
44
 
60
45
 
61
- def update_feeds_for( site_key, opts={} )
46
+ def refresh_feeds_for( site_key, opts={} ) # refresh (fetch+parse+update) feeds for site
62
47
  if debug?
63
48
  ## turn on logging for sql too
64
49
  ActiveRecord::Base.logger = Logger.new( STDOUT )
@@ -71,35 +56,23 @@ class Refresher
71
56
  site = Site.find_by_key!( site_key )
72
57
 
73
58
  site.feeds.each do |feed|
74
- update_feed_worker( feed )
59
+ refresh_feed_worker( feed )
75
60
  end
76
61
 
77
- end # method update_feeds
62
+ end # method refresh_feeds_for
78
63
 
79
64
 
80
65
  private
81
- def update_site_worker( site_rec )
82
- site_config = @worker.site_by_rec_if_modified( site_rec )
83
-
84
- # on error or if http-not modified etc. skip update/processing
85
- return if site_config.nil?
86
-
87
- subscriber = Subscriber.new
88
- subscriber.debug = debug? ? true : false # pass along debug flag
66
+ def refresh_feed_worker( feed_rec )
67
+ feed_xml = @worker.fetch( feed_rec )
89
68
 
90
- subscriber.update_subscriptions_for( site_rec.key, site_config )
91
- end
92
-
93
-
94
- def update_feed_worker( feed_rec )
95
- feed = @worker.feed_by_rec_if_modified( feed_rec )
96
-
97
69
  # on error or if http-not modified etc. skip update/processing
98
- return if feed.nil?
70
+ return if feed_xml.nil?
71
+
72
+ feed = FeedUtils::Parser.parse( feed_xml )
99
73
 
100
74
  ## fix/todo: reload feed_rec - fetched date updated etc.
101
75
  ## check if needed for access to fetched date
102
-
103
76
 
104
77
  ## todo/check: move feed_rec update to the end (after item updates??)
105
78
 
@@ -107,7 +80,7 @@ private
107
80
  # generator
108
81
  # published_at,built_at,touched_at,fetched_at
109
82
  # summary,title2
110
-
83
+
111
84
  ## fix:
112
85
  ## weird rss exception error on windows w/ dates
113
86
  # e.g. /lib/ruby/1.9.1/rss/rss.rb:37:in `w3cdtf': wrong number of arguments (1 for 0) (ArgumentError)
@@ -117,9 +90,10 @@ private
117
90
 
118
91
 
119
92
  feed_rec.debug = debug? ? true : false # pass along debug flag
120
- ## fix/todo: pass debug flag as opts - debug: true|false !!!!!!
121
- feed_rec.save_from_struct!( feed ) # todo: find a better name - why? why not??
122
93
 
94
+ ## fix/todo: pass debug flag as opts - debug: true|false !!!!!!
95
+ # fix/todo: find a better name - why? why not?? => use update_from_struct!
96
+ feed_rec.save_from_struct!( feed )
123
97
 
124
98
  # update cached value last published for item
125
99
  last_item_rec = feed_rec.items.latest.limit(1).first # note limit(1) will return relation/array - use first to get first element or nil from ary
@@ -130,8 +104,8 @@ private
130
104
  feed_rec.update_attributes!( last_published: last_item_rec.touched )
131
105
  end
132
106
  end
133
- end # method update_feed_worker
107
+ end # method refresh_feed_worker
134
108
 
135
- end # class Refresher
109
+ end # class FeedRefresher
136
110
 
137
111
  end # module Pluto
@@ -0,0 +1,134 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Pluto
5
+
6
+ class SiteFetcher
7
+
8
+ include LogUtils::Logging
9
+ include Models # for easy convenience access for Activity etc.
10
+
11
+ def initialize
12
+ @worker = Fetcher::Worker.new
13
+ end
14
+
15
+ def debug=(value) @debug = value; end
16
+ def debug?() @debug || false; end
17
+
18
+ def fetch( site_rec )
19
+ ####################################################
20
+ # try smart http update; will update db records
21
+
22
+ site_url = site_rec.url
23
+ site_key = site_rec.key
24
+
25
+ ### todo/fix: normalize/unify feed_url
26
+ ## - same in fetcher - use shared utility method or similar
27
+
28
+ @worker.use_cache = true
29
+ @worker.cache[ site_url ] = {
30
+ 'etag' => site_rec.http_etag,
31
+ 'last-modified' => site_rec.http_last_modified
32
+ }
33
+
34
+ response = @worker.get( site_url )
35
+ @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
36
+
37
+ site_fetched = Time.now
38
+
39
+ ###
40
+ # Note: Net::HTTP will NOT set encoding UTF-8 etc.
41
+ # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
42
+ # thus, set/force encoding to utf-8
43
+ site_text = response.body.to_s
44
+ site_text = site_text.force_encoding( Encoding::UTF_8 )
45
+
46
+ if response.code == '304' # not modified (conditional GET - e.g. using etag/last-modified)
47
+
48
+ if site_text.index('@include')
49
+ ## note: if the site_text includes @include
50
+ ## we must revalidate complete file hierarchy (tree) for now
51
+ ### continue;
52
+ ##
53
+ ## fix/todo: use ahead-of-time preprocessor ?? in the future to simplify???
54
+ else
55
+ puts "OK - fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
56
+ puts "no change; request returns not modified (304); skipping parsing site config"
57
+ return nil # no updates available; nothing to do
58
+ end
59
+
60
+ elsif response.code != '200' # note Net::HTTP response.code is a string in ruby
61
+
62
+ puts "*** error: fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
63
+
64
+ site_attribs = {
65
+ http_code: response.code.to_i,
66
+ http_server: response.header[ 'server' ],
67
+ http_etag: nil,
68
+ http_last_modified: nil,
69
+ fetched: site_fetched
70
+ }
71
+ site_rec.update_attributes!( site_attribs )
72
+
73
+ ## add log error activity -- in future add to error log - better - why? why not?
74
+ Activity.create!( text: "*** error: fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}" )
75
+
76
+ return nil # sorry; no feed for parsing available
77
+ else
78
+ # assume 200; continue w/ processing
79
+ end
80
+
81
+ puts "OK - fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
82
+
83
+ site_attribs = {
84
+ http_code: response.code.to_i,
85
+ http_server: response.header[ 'server' ],
86
+ http_etag: response.header[ 'etag' ],
87
+ http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
88
+ fetched: site_fetched
89
+ }
90
+
91
+ ## if debug?
92
+ puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
93
+ puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
94
+ puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
95
+ ## end
96
+
97
+ site_rec.update_attributes!( site_attribs )
98
+
99
+
100
+ #################
101
+ ### fix: add support for http_etag cache etc. - how??
102
+ ###
103
+ ### use from_text( text, base: base ) !!!!!!!!
104
+ ### do NOT reissue first request
105
+ ##
106
+ ## fix: use special case/method for update_with_includes!!!
107
+ ## keep it simple w/o includes (do NOT mix in one method)
108
+ ## split into two methods!!!
109
+
110
+ ## retry w/ preprocessor
111
+ ## refetch if @include found w/ all includes included
112
+ if site_text.index('@include')
113
+ site_text = InclPreproc.from_url( site_url ).read
114
+ end
115
+
116
+ ## logger.debug "site_text:"
117
+ ## logger.debug site_text[ 0..300 ] # get first 300 chars
118
+
119
+ site_text
120
+
121
+ ###
122
+ ## todo/fix:
123
+ ### move INI.load out of this method!! - return site_text or nil
124
+ ##
125
+ ## puts "Before parsing site config >#{site_key}<..."
126
+ ##
127
+ # assume ini format for now
128
+ ## site_config = INI.load( site_text )
129
+ ## site_config
130
+ end
131
+
132
+ end # class SiteFetcher
133
+
134
+ end # module Pluto
@@ -0,0 +1,67 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Pluto
5
+
6
+ #######
7
+ # note: refresh
8
+ # refresh will fetch site subscriptions, parse and then update the site subscriptions
9
+ # (e.g. update is just one operation of refresh)
10
+
11
+
12
+ class SiteRefresher
13
+
14
+ include LogUtils::Logging
15
+ include Models
16
+
17
+ def initialize
18
+ @worker = SiteFetcher.new
19
+ end
20
+
21
+ def debug=(value) @debug = value; end
22
+ def debug?() @debug || false; end
23
+
24
+ def refresh_sites( opts={} ) # refresh (fetch+parse+update) all site configs
25
+ if debug?
26
+ ## turn on logging for sql too
27
+ ActiveRecord::Base.logger = Logger.new( STDOUT )
28
+ @worker.debug = true # also pass along worker debug flag if set
29
+ end
30
+
31
+ start_time = Time.now
32
+ Activity.create!( text: "start update sites (#{Site.count})" )
33
+
34
+ #### - hack - use order(:id) instead of .all - avoids rails/activerecord 4 warnings
35
+ Site.order(:id).each do |site|
36
+ refresh_site_worker( site ) if site.url.present? # note: only update if (source) url present
37
+ end
38
+
39
+ total_secs = Time.now - start_time
40
+ Activity.create!( text: "done update sites (#{Site.count}) in #{total_secs}s" )
41
+ end
42
+
43
+
44
+ private
45
+ def refresh_site_worker( site_rec )
46
+ site_text = @worker.fetch( site_rec )
47
+
48
+ # on error or if http-not modified etc. skip update/processing
49
+ return if site_text.nil?
50
+
51
+ site_config = INI.load( site_text )
52
+
53
+ site_updater = SiteUpdater.new
54
+ site_updater.debug = debug? ? true : false # pass along debug flag
55
+
56
+ ### todo/fix:
57
+ ## allow passing in as first arg database rec!!!
58
+ ## - if passed in database rec - do NOT lookup record by site_config.key!!!
59
+ ## use existing key (lets you change/update key without creating a new duplicate site entry, for example)
60
+ ## - or use a new method (instead of overloading arg) ?? - why? why not??
61
+ site_updater.update_subscriptions_for( site_rec.key, site_config )
62
+ end
63
+
64
+ end # class SiteRefresher
65
+
66
+ end # module Pluto
67
+
@@ -3,10 +3,9 @@
3
3
 
4
4
  module Pluto
5
5
 
6
- class Subscriber
6
+ class SiteUpdater
7
7
 
8
8
  include LogUtils::Logging
9
-
10
9
  include Models
11
10
 
12
11
  def debug=(value) @debug = value; end
@@ -156,8 +155,8 @@ class Subscriber
156
155
  site_rec.subscriptions.create!( feed_id: feed_rec.id )
157
156
  end
158
157
 
159
- end # method update_subscriptions
158
+ end # method update_subscriptions_for
160
159
 
161
- end # class Subscriber
160
+ end # class SiteUpdater
162
161
 
163
162
  end # module Pluto
@@ -4,7 +4,7 @@
4
4
  module PlutoUpdate
5
5
 
6
6
  MAJOR = 1
7
- MINOR = 4
7
+ MINOR = 5
8
8
  PATCH = 0
9
9
  VERSION = [MAJOR,MINOR,PATCH].join('.')
10
10
 
@@ -0,0 +1,18 @@
1
+ title = Planet Ruby
2
+ source = https://github.com/feedreader/pluto.update/raw/master/test/data/ruby.ini
3
+
4
+ [rubylang]
5
+ title = Ruby Lang News
6
+ link = http://www.ruby-lang.org/en/news
7
+ feed = http://www.ruby-lang.org/en/feeds/news.rss
8
+
9
+ [rubyonrails]
10
+ title = Ruby on Rails News
11
+ link = http://weblog.rubyonrails.org
12
+ feed = http://weblog.rubyonrails.org/feed/atom.xml
13
+
14
+ [viennarb]
15
+ title = Vienna.rb News
16
+ link = http://vienna-rb.at
17
+ feed = http://vienna-rb.at/atom.xml
18
+
@@ -0,0 +1,25 @@
1
+ ## $:.unshift(File.dirname(__FILE__))
2
+
3
+
4
+ ## minitest setup
5
+
6
+ require 'minitest/autorun'
7
+
8
+ ## our own code
9
+ require 'pluto/update'
10
+
11
+
12
+ LogUtils::Logger.root.level = :debug
13
+
14
+
15
+ ## some shortcuts
16
+ Log = LogDb::Model::Log
17
+ Prop = ConfDb::Model::Prop
18
+
19
+ Site = Pluto::Model::Site
20
+ Feed = Pluto::Model::Feed
21
+ Item = Pluto::Model::Item
22
+ Subscription = Pluto::Model::Subscription
23
+
24
+
25
+ Pluto.setup_in_memory_db
@@ -0,0 +1,38 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_refresh.rb
6
+ # or better
7
+ # rake test
8
+
9
+ require 'helper'
10
+
11
+ class TestRefresh < MiniTest::Test
12
+
13
+ def setup
14
+ Site.delete_all
15
+ Feed.delete_all
16
+ Item.delete_all
17
+ Subscription.delete_all
18
+
19
+ site_text = File.read( "#{PlutoUpdate.root}/test/data/ruby.ini")
20
+ site_config = INI.load( site_text )
21
+
22
+ site_updater = Pluto::SiteUpdater.new
23
+ site_updater.update_subscriptions_for( 'ruby', site_config )
24
+ end
25
+
26
+
27
+ def test_refresh_sites
28
+ Pluto.refresh_sites
29
+ assert true
30
+ end
31
+
32
+ def test_refresh_feeds
33
+ Pluto.refresh_feeds
34
+ assert true
35
+ end
36
+
37
+ end # class TestRefresh
38
+
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_site.rb
6
+ # or better
7
+ # rake test
8
+
9
+ require 'helper'
10
+
11
+ class TestSite < MiniTest::Test
12
+
13
+ def setup
14
+ Site.delete_all
15
+ Feed.delete_all
16
+ Item.delete_all
17
+ Subscription.delete_all
18
+ end
19
+
20
+
21
+ def test_site_updater
22
+ site_text = File.read( "#{PlutoUpdate.root}/test/data/ruby.ini")
23
+ site_config = INI.load( site_text )
24
+ pp site_config
25
+
26
+ assert_equal 0, Site.count
27
+ assert_equal 0, Feed.count
28
+
29
+ site_updater = Pluto::SiteUpdater.new
30
+ site_updater.update_subscriptions_for( 'ruby', site_config )
31
+
32
+ assert_equal 1, Site.count
33
+ assert_equal 3, Feed.count
34
+
35
+ ruby = Site.find_by_key!( 'ruby' )
36
+ assert_equal 'Planet Ruby', ruby.title
37
+ assert_equal 3, ruby.subscriptions.count
38
+ assert_equal 3, ruby.feeds.count
39
+
40
+ rubylang = Feed.find_by_key!( 'rubylang' )
41
+ assert_equal 'Ruby Lang News', rubylang.title
42
+ assert_equal 'http://www.ruby-lang.org/en/news', rubylang.url
43
+ assert_equal 'http://www.ruby-lang.org/en/feeds/news.rss', rubylang.feed_url
44
+ end
45
+
46
+ end # class TestSite
47
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pluto-update
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-09 00:00:00.000000000 Z
11
+ date: 2015-01-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pluto-models
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.3.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: pluto-feedfetcher
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.1.0
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: fetcher
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -89,16 +103,21 @@ extra_rdoc_files:
89
103
  - Manifest.txt
90
104
  - README.md
91
105
  files:
106
+ - ".gemtest"
92
107
  - HISTORY.md
93
108
  - Manifest.txt
94
109
  - README.md
95
110
  - Rakefile
96
111
  - lib/pluto/update.rb
97
- - lib/pluto/update/fetcher.rb
98
- - lib/pluto/update/refresher.rb
99
- - lib/pluto/update/subscriber.rb
100
- - lib/pluto/update/updater.rb
112
+ - lib/pluto/update/feed_refresher.rb
113
+ - lib/pluto/update/site_fetcher.rb
114
+ - lib/pluto/update/site_refresher.rb
115
+ - lib/pluto/update/site_updater.rb
101
116
  - lib/pluto/update/version.rb
117
+ - test/data/ruby.ini
118
+ - test/helper.rb
119
+ - test/test_refresh.rb
120
+ - test/test_site.rb
102
121
  homepage: https://github.com/feedreader/pluto.update
103
122
  licenses:
104
123
  - Public Domain
@@ -125,4 +144,6 @@ rubygems_version: 2.4.2
125
144
  signing_key:
126
145
  specification_version: 4
127
146
  summary: pluto-update - planet feed 'n' subscription updater
128
- test_files: []
147
+ test_files:
148
+ - test/test_site.rb
149
+ - test/test_refresh.rb
@@ -1,357 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module Pluto
5
-
6
- class Fetcher
7
-
8
- include LogUtils::Logging
9
-
10
- include Models # for easy convenience access for Activity etc.
11
-
12
- def initialize
13
- @worker = ::Fetcher::Worker.new
14
- end
15
-
16
- def debug=(value) @debug = value; end
17
- def debug?() @debug || false; end
18
-
19
-
20
- def fetch_feed( url )
21
- response = @worker.get( url )
22
-
23
- ## if debug?
24
- puts "http status #{response.code} #{response.message}"
25
-
26
- puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
27
- puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
28
- puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
29
- ## end
30
-
31
- xml = response.body
32
-
33
- ###
34
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
35
- # will mostly be ASCII
36
- # - try to change encoding to UTF-8 ourselves
37
- logger.debug "xml.encoding.name (before): #{xml.encoding.name}"
38
-
39
- #####
40
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
41
-
42
- ## NB:
43
- # for now "hardcoded" to utf8 - what else can we do?
44
- # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
45
- xml = xml.force_encoding( Encoding::UTF_8 )
46
- logger.debug "xml.encoding.name (after): #{xml.encoding.name}"
47
-
48
- xml
49
- end
50
-
51
-
52
- def feed_by_rec( feed_rec )
53
- # simple feed fetcher; use for debugging (only/mostly)
54
- # -- will NOT change db records in any way
55
-
56
- feed_url = feed_rec.feed_url
57
- feed_key = feed_rec.key
58
-
59
- feed_xml = fetch_feed( feed_url )
60
-
61
- logger.debug "feed_xml:"
62
- logger.debug feed_xml[ 0..500 ] # get first 500 chars
63
-
64
- # if opts.verbose? # also write a copy to disk
65
- if debug?
66
- logger.debug "saving feed to >./#{feed_key}.xml<..."
67
- File.open( "./#{feed_key}.xml", 'w' ) do |f|
68
- f.write( feed_xml )
69
- end
70
- end
71
-
72
- puts "Before parsing feed >#{feed_key}<..."
73
-
74
- ## fix/todo: check for feed.nil? -> error parsing!!!
75
- # or throw exception
76
- feed = FeedUtils::Parser.parse( feed_xml )
77
- feed
78
- end
79
-
80
-
81
- def feed_by_rec_if_modified( feed_rec ) # try smart http update; will update db records
82
- feed_url = feed_rec.feed_url
83
- feed_key = feed_rec.key
84
-
85
- ### todo/fix: normalize/unifiy feed_url
86
- ## - same in fetcher - use shared utitlity method or similar
87
-
88
- @worker.use_cache = true
89
- @worker.cache[ feed_url ] = {
90
- 'etag' => feed_rec.http_etag,
91
- 'last-modified' => feed_rec.http_last_modified
92
- }
93
-
94
- begin
95
- response = @worker.get( feed_url )
96
- rescue SocketError => e
97
- ## catch socket error for unknown domain names (e.g. pragdave.blogs.pragprog.com)
98
- ### will result in SocketError -- getaddrinfo: Name or service not known
99
- puts "*** error: fetching feed '#{feed_key}' - #{e.to_s}"
100
- Activity.create!( text: "*** error: fetching feed '#{feed_key}' - #{e.to_s}" )
101
-
102
- ### todo/fix: update feed rec in db
103
- @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
104
- return nil
105
- end
106
-
107
- @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
108
-
109
- if response.code == '304' # not modified (conditional GET - e.g. using etag/last-modified)
110
- puts "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
111
- puts "no change; request returns not modified (304); skipping parsing feed"
112
- return nil # no updates available; nothing to do
113
- end
114
-
115
- feed_fetched = Time.now
116
-
117
- if response.code != '200' # note Net::HTTP response.code is a string in ruby
118
-
119
- puts "*** error: fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
120
-
121
- feed_attribs = {
122
- http_code: response.code.to_i,
123
- http_server: response.header[ 'server' ],
124
- http_etag: nil,
125
- http_last_modified: nil,
126
- body: nil,
127
- md5: nil,
128
- fetched: feed_fetched
129
- }
130
- feed_rec.update_attributes!( feed_attribs )
131
-
132
- ## add log error activity -- in future add to error log - better - why? why not?
133
- Activity.create!( text: "*** error: fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}" )
134
-
135
- return nil # sorry; no feed for parsing available
136
- end
137
-
138
- puts "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
139
-
140
- feed_xml = response.body
141
- ###
142
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
143
- # will mostly be ASCII
144
- # - try to change encoding to UTF-8 ourselves
145
- logger.debug "feed_xml.encoding.name (before): #{feed_xml.encoding.name}"
146
-
147
-
148
- #####
149
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
150
-
151
- # try Converting ASCII-8BIT to UTF-8 based domain-specific guesses
152
- begin
153
- # Try it as UTF-8 directly
154
- # Note: make a copy/dup - otherwise convert fails (because string is already changed/corrupted)
155
- feed_xml_cleaned = feed_xml.dup.force_encoding( Encoding::UTF_8 )
156
- unless feed_xml_cleaned.valid_encoding?
157
-
158
- puts "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1"
159
- Activity.create!( text: "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1" )
160
- # Some of it might be old Windows code page
161
- # -- (Windows Code Page CP1252 is ISO_8859_1 is Latin-1 - check ??)
162
-
163
- # tell ruby the encoding
164
- # encode to utf-8
165
- ## use all in code encode ?? e.g. feed_xml_cleaned = feed_xml.encode( Encoding::UTF_8, Encoding::ISO_8859_1 )
166
- feed_xml_cleaned = feed_xml.dup.force_encoding( Encoding::ISO_8859_1 ).encode( Encoding::UTF_8 )
167
- end
168
- feed_xml = feed_xml_cleaned
169
- rescue EncodingError => e
170
- puts "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e.to_s}"
171
- Activity.create!( text: "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e.to_s}" )
172
-
173
- # Force it to UTF-8, throwing out invalid bits
174
- ## todo: check options - add ?? or something to mark invalid chars ???
175
- feed_xml.encode!( Encoding::UTF_8, :invalid => :replace, :undef => :replace )
176
- end
177
-
178
- ## NB:
179
- # for now "hardcoded" to utf8 - what else can we do?
180
- # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
181
- ### old "simple" version
182
- ## feed_xml = feed_xml.force_encoding( Encoding::UTF_8 )
183
-
184
-
185
- logger.debug "feed_xml.encoding.name (after): #{feed_xml.encoding.name}"
186
-
187
- ## check for md5 hash for response.body
188
-
189
- last_feed_md5 = feed_rec.md5
190
- feed_md5 = Digest::MD5.hexdigest( feed_xml )
191
-
192
- if last_feed_md5 && last_feed_md5 == feed_md5
193
- # not all servers handle conditional gets, so while not much can be
194
- # done about the bandwidth, but if the response body is identical
195
- # the downstream processing (parsing, caching, ...) can be avoided.
196
- # - thanks to planet mars -fido.rb for the idea, cheers.
197
-
198
- puts "no change; md5 digests match; skipping parsing feed"
199
- return nil # no updates available; nothing to do
200
- end
201
-
202
- feed_attribs = {
203
- http_code: response.code.to_i,
204
- http_server: response.header[ 'server' ],
205
- http_etag: response.header[ 'etag' ],
206
- http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
207
- body: feed_xml,
208
- md5: feed_md5,
209
- fetched: feed_fetched
210
- }
211
-
212
- ## if debug?
213
- puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
214
- puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
215
- puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
216
- ## end
217
-
218
- ### note: might crash w/ encoding errors when saving in postgress
219
- ## e.g. PG::CharacterNotInRepertoire: ERROR: ...
220
- ## catch error, log it and stop for now
221
- #
222
- # in the future check for different charset than utf-8 ?? possible?? how to deal with non-utf8 charsets??
223
-
224
- begin
225
- feed_rec.update_attributes!( feed_attribs )
226
- rescue Exception => e
227
- # log db error; and continue
228
- puts "*** error: updating feed database record '#{feed_key}' - #{e.to_s}"
229
- Activity.create!( text: "*** error: updating feed database record '#{feed_key}' - #{e.to_s}" )
230
- return nil # sorry; corrupt feed; parsing not possible; fix char encoding - make it an option in config??
231
- end
232
-
233
-
234
- logger.debug "feed_xml:"
235
- logger.debug feed_xml[ 0..300 ] # get first 300 chars
236
-
237
- puts "Before parsing feed >#{feed_key}<..."
238
-
239
- ### move to feedutils
240
- ### logger.debug "using stdlib RSS::VERSION #{RSS::VERSION}"
241
-
242
- ## fix/todo: check for feed.nil? -> error parsing!!!
243
- # or throw exception
244
- feed = FeedUtils::Parser.parse( feed_xml )
245
- feed
246
- end
247
-
248
-
249
- def site_by_rec_if_modified( site_rec ) # try smart http update; will update db records
250
- site_url = site_rec.url
251
- site_key = site_rec.key
252
-
253
- ### todo/fix: normalize/unifiy feed_url
254
- ## - same in fetcher - use shared utitlity method or similar
255
-
256
- @worker.use_cache = true
257
- @worker.cache[ site_url ] = {
258
- 'etag' => site_rec.http_etag,
259
- 'last-modified' => site_rec.http_last_modified
260
- }
261
-
262
- response = @worker.get( site_url )
263
- @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
264
-
265
- site_fetched = Time.now
266
-
267
- ###
268
- # Note: Net::HTTP will NOT set encoding UTF-8 etc.
269
- # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
270
- # thus, set/force encoding to utf-8
271
- site_text = response.body.to_s
272
- site_text = site_text.force_encoding( Encoding::UTF_8 )
273
-
274
- if response.code == '304' # not modified (conditional GET - e.g. using etag/last-modified)
275
-
276
- if site_text.index('@include')
277
- ## note: if the site_text includes @include
278
- ## we must revalidate complete file hierachy(tree) for now
279
- ### continue;
280
- ##
281
- ## fix/todo: use ahead-of-time preprocessor ?? in the future to simplify???
282
- else
283
- puts "OK - fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
284
- puts "no change; request returns not modified (304); skipping parsing site config"
285
- return nil # no updates available; nothing to do
286
- end
287
-
288
- elsif response.code != '200' # note Net::HTTP response.code is a string in ruby
289
-
290
- puts "*** error: fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
291
-
292
- site_attribs = {
293
- http_code: response.code.to_i,
294
- http_server: response.header[ 'server' ],
295
- http_etag: nil,
296
- http_last_modified: nil,
297
- fetched: site_fetched
298
- }
299
- site_rec.update_attributes!( site_attribs )
300
-
301
- ## add log error activity -- in future add to error log - better - why? why not?
302
- Activity.create!( text: "*** error: fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}" )
303
-
304
- return nil # sorry; no feed for parsing available
305
- else
306
- # assume 200; continue w/ processing
307
- end
308
-
309
- puts "OK - fetching site '#{site_key}' - HTTP status #{response.code} #{response.message}"
310
-
311
- site_attribs = {
312
- http_code: response.code.to_i,
313
- http_server: response.header[ 'server' ],
314
- http_etag: response.header[ 'etag' ],
315
- http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
316
- fetched: site_fetched
317
- }
318
-
319
- ## if debug?
320
- puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
321
- puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
322
- puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
323
- ## end
324
-
325
- site_rec.update_attributes!( site_attribs )
326
-
327
-
328
- #################
329
- ### fix: add support for http_etag cache etc. - how??
330
- ###
331
- ### use from_text( text, base: base ) !!!!!!!!
332
- ### do NOT reissue first request
333
- ##
334
- ## fix: use special case/method for update_with_includes!!!
335
- ## keep it simple w/o includes (do NOT mix in one method)
336
- ## split into two methods!!!
337
-
338
- ## retry w/ preprocesser
339
- ## refetch if @include found w/ all includes included
340
- if site_text.index('@include')
341
- site_text = InclPreproc.from_url( site_url ).read
342
- end
343
-
344
- ## logger.debug "site_text:"
345
- ## logger.debug site_text[ 0..300 ] # get first 300 chars
346
-
347
-
348
- puts "Before parsing site config >#{site_key}<..."
349
-
350
- # assume ini format for now
351
- site_config = INI.load( site_text )
352
- site_config
353
- end
354
-
355
- end # class Fetcher
356
-
357
- end # module Pluto
@@ -1,62 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module Pluto
5
-
6
- class Updater
7
-
8
- include LogUtils::Logging
9
-
10
- ### fix!!!!!: change config to text - yes/no - why? why not??
11
- # or pass along struct
12
- # - with hash and text and format(e.g. ini) as fields???
13
- #
14
- # - why? - we need to get handle on md5 digest/hash plus on plain text, ideally to store in db
15
- ## - pass along unparsed text!! - not hash struct
16
- # - will get saved in db plus we need to generate md5 hash
17
- # - add filename e.g. ruby.ini|ruby.conf as opt ??
18
- # or add config format as opt e.g. ini?
19
-
20
- def initialize( opts, config )
21
- @opts = opts
22
- @config = config
23
- end
24
-
25
- attr_reader :opts, :config
26
-
27
- def run( arg )
28
- arg = arg.downcase.gsub('.ini','') # remove file extension if present
29
-
30
- update_for( arg )
31
- end
32
-
33
- def update_for( name )
34
-
35
- ## note: allow (optional) config of site key too
36
- site_key = config['key'] || config['slug']
37
- if site_key.nil?
38
- ## if no key configured; use (file)name; remove -_ chars
39
- ## e.g. jekyll-meta becomes jekyllmeta etc.
40
- site_key = name.downcase.gsub( /[\-_]/, '' )
41
- end
42
-
43
- ###################
44
- # step 1) update subscriptions
45
- subscriber = Subscriber.new
46
-
47
- # pass along debug/verbose setting/switch
48
- subscriber.debug = true if opts.verbose?
49
- subscriber.update_subscriptions_for( site_key, config )
50
-
51
- ##############################
52
- # step 2) update feeds
53
- refresher = Refresher.new
54
-
55
- # pass along debug/verbose setting/switch
56
- refresher.debug = true if opts.verbose?
57
- refresher.update_feeds_for( site_key )
58
- end # method run
59
-
60
- end # class Updater
61
-
62
- end # module Pluto