pluto-feedfetcher 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8f4eed6539db8183401c2da47b91e42ecc59aaa3
4
+ data.tar.gz: 4cbf9edd2c859a5b6b643e901789b3c84e0dce9d
5
+ SHA512:
6
+ metadata.gz: 493a4e41cdc0628aab781058f442a316d254a358925893e3f17db4e1be2221472b4199351e485390cb4844e0b14b9e495f42746bd9bd43324c7f2bbff9b4bdd5
7
+ data.tar.gz: d199ce23f2b41e4cd140c4b23e82193bd1fa7adcf8f92e2ccd02e325865b514ad3fd3cea9ba2c952c8ecd1fc90ebe1b1a7d823ec1189d6dec532440b36aec1ff
data/HISTORY.md ADDED
@@ -0,0 +1,5 @@
1
+
2
+ ### 0.1.0 / 2015-01-11
3
+
4
+ * Everything is new. First release
5
+
data/Manifest.txt ADDED
@@ -0,0 +1,8 @@
1
+ HISTORY.md
2
+ Manifest.txt
3
+ README.md
4
+ Rakefile
5
+ lib/pluto/feedfetcher.rb
6
+ lib/pluto/feedfetcher/basic.rb
7
+ lib/pluto/feedfetcher/cond_get_with_cache.rb
8
+ lib/pluto/feedfetcher/version.rb
data/README.md ADDED
@@ -0,0 +1,25 @@
1
+ # pluto-feedfetcher gem - fetch web feeds (w/ conditional HTTP get e.g. use etags, if-modified-since etc.)
2
+
3
+ * home :: [github.com/feedreader/pluto.feed.fetcher](https://github.com/feedreader/pluto.feed.fetcher)
4
+ * bugs :: [github.com/feedreader/pluto.feed.fetcher/issues](https://github.com/feedreader/pluto.feed.fetcher/issues)
5
+ * gem :: [rubygems.org/gems/pluto-feedfetcher](https://rubygems.org/gems/pluto-feedfetcher)
6
+ * rdoc :: [rubydoc.info/gems/pluto-feedfetcher](http://rubydoc.info/gems/pluto-feedfetcher)
7
+ * forum :: [groups.google.com/group/feedreader](http://groups.google.com/group/feedreader)
8
+
9
+
10
+
11
+ ## Usage
12
+
13
+ TBD
14
+
15
+
16
+ ## License
17
+
18
+ The `pluto-feedfetcher` scripts are dedicated to the public domain.
19
+ Use them as you please with no restrictions whatsoever.
20
+
21
+ ## Questions? Comments?
22
+
23
+ Send them along to the [Planet Pluto and Friends Forum/Mailing List](http://groups.google.com/group/feedreader).
24
+ Thanks!
25
+
data/Rakefile ADDED
@@ -0,0 +1,31 @@
1
+ require 'hoe'
2
+ require './lib/pluto/feedfetcher/version.rb'
3
+
4
+ Hoe.spec 'pluto-feedfetcher' do
5
+
6
+ self.version = PlutoFeedFetcher::VERSION
7
+
8
+ self.summary = "pluto-feedfetcher - fetch web feeds (w/ conditional HTTP get e.g. use etags, if-modified-since etc.)"
9
+ self.description = summary
10
+
11
+ self.urls = ['https://github.com/feedreader/pluto.feed.fetcher']
12
+
13
+ self.author = 'Gerald Bauer'
14
+ self.email = 'feedreader@googlegroups.com'
15
+
16
+ # switch extension to .markdown for GitHub formatting
17
+ self.readme_file = 'README.md'
18
+ self.history_file = 'HISTORY.md'
19
+
20
+ self.extra_deps = [
21
+ ['pluto-models', '>= 1.3.2'],
22
+ ['fetcher', '>= 0.4.4'],
23
+ ]
24
+
25
+ self.licenses = ['Public Domain']
26
+
27
+ self.spec_extras = {
28
+ required_ruby_version: '>= 1.9.2'
29
+ }
30
+
31
+ end
@@ -0,0 +1,93 @@
1
+ # encoding: utf-8
2
+
3
+ module Pluto
4
+
5
+ class FeedFetcherBasic
6
+
7
+ include LogUtils::Logging
8
+ include Models # for easy convenience access for Activity etc.
9
+
10
+ def initialize
11
+ @worker = Fetcher::Worker.new
12
+ end
13
+
14
+ def debug=(value) @debug = value; end
15
+ def debug?() @debug || false; end
16
+
17
+
18
+ def fetch( feed_rec )
19
+ # simple/basic feed fetcher; use for debugging (only/mostly)
20
+ # -- Note: will NOT change db records in any way
21
+
22
+ feed_url = feed_rec.feed_url
23
+ feed_key = feed_rec.key
24
+
25
+ feed_xml = fix_me_fetch_utf8( feed_url )
26
+
27
+ logger.debug "feed_xml:"
28
+ logger.debug feed_xml[ 0..500 ] # get first 500 chars
29
+
30
+
31
+ #### todo/fix: make it generic - move out of this method and (re)use - for all fetchers??
32
+ # if opts.verbose? # also write a copy to disk
33
+ if debug?
34
+ logger.debug "saving feed to >./#{feed_key}.xml<..."
35
+ File.open( "./#{feed_key}.xml", 'w' ) do |f|
36
+ f.write( feed_xml )
37
+ end
38
+ end
39
+
40
+
41
+
42
+ ### todo/fix:
43
+ ### return feed_xml !!! - move FeedUtils::Parser.parse to update or something !!!
44
+
45
+ puts "Before parsing feed >#{feed_key}<..."
46
+
47
+
48
+ feed_xml
49
+
50
+ ## fix/todo: check for feed.nil? -> error parsing!!!
51
+ # or throw exception
52
+ # feed = FeedUtils::Parser.parse( feed_xml )
53
+ # feed
54
+ end
55
+
56
+
57
+ ###########
58
+ # todo/fix: use "standard" fetch method e.g. Fetcher.read_utf8!() - clean up/remove (duplicate) here??
59
+ private
60
+ def fix_me_fetch_utf8( url )
61
+ response = @worker.get( url )
62
+
63
+ ## if debug?
64
+ puts "http status #{response.code} #{response.message}"
65
+
66
+ puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
67
+ puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
68
+ puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
69
+ ## end
70
+
71
+ xml = response.body
72
+
73
+ ###
74
+ # NB: Net::HTTP will NOT set encoding UTF-8 etc.
75
+ # will mostly be ASCII-8BIT (binary)
76
+ # - try to change encoding to UTF-8 ourselves
77
+ logger.debug "xml.encoding.name (before): #{xml.encoding.name}"
78
+
79
+ #####
80
+ # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
81
+
82
+ ## NB:
83
+ # for now "hardcoded" to utf8 - what else can we do?
84
+ # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
85
+ xml = xml.force_encoding( Encoding::UTF_8 )
86
+ logger.debug "xml.encoding.name (after): #{xml.encoding.name}"
87
+
88
+ xml
89
+ end
90
+
91
+ end # class FeedFetcherBasic
92
+
93
+ end # module Pluto
@@ -0,0 +1,196 @@
1
+ # encoding: utf-8
2
+
3
+ module Pluto
4
+
5
+ class FeedFetcherCondGetWithCache
6
+
7
+ include LogUtils::Logging
8
+ include Models # for easy convenience access for Activity etc.
9
+
10
+ def initialize
11
+ @worker = Fetcher::Worker.new
12
+ end
13
+
14
+ def debug=(value) @debug = value; end
15
+ def debug?() @debug || false; end
16
+
17
+
18
+ def fetch( feed_rec )
19
+ #############
20
+ # try smart http update; will update db records
21
+
22
+ feed_url = feed_rec.feed_url
23
+ feed_key = feed_rec.key
24
+
25
+ ### todo/fix: normalize/unify feed_url
26
+ ## - same in fetcher - use shared utility method or similar
27
+
28
+ @worker.use_cache = true
29
+ @worker.cache[ feed_url ] = {
30
+ 'etag' => feed_rec.http_etag,
31
+ 'last-modified' => feed_rec.http_last_modified
32
+ }
33
+
34
+ begin
35
+ response = @worker.get( feed_url )
36
+ rescue SocketError => e
37
+ ## catch socket error for unknown domain names (e.g. pragdave.blogs.pragprog.com)
38
+ ### will result in SocketError -- getaddrinfo: Name or service not known
39
+ puts "*** error: fetching feed '#{feed_key}' - #{e.to_s}"
40
+ Activity.create!( text: "*** error: fetching feed '#{feed_key}' - #{e.to_s}" )
41
+
42
+ ### todo/fix: update feed rec in db
43
+ @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
44
+ return nil
45
+ end
46
+
47
+ @worker.use_cache = false # fix/todo: restore old use_cache setting instead of false
48
+
49
+ if response.code == '304' # not modified (conditional GET - e.g. using etag/last-modified)
50
+ puts "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
51
+ puts "no change; request returns not modified (304); skipping parsing feed"
52
+ return nil # no updates available; nothing to do
53
+ end
54
+
55
+ feed_fetched = Time.now
56
+
57
+ if response.code != '200' # note Net::HTTP response.code is a string in ruby
58
+
59
+ puts "*** error: fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
60
+
61
+ feed_attribs = {
62
+ http_code: response.code.to_i,
63
+ http_server: response.header[ 'server' ],
64
+ http_etag: nil,
65
+ http_last_modified: nil,
66
+ body: nil,
67
+ md5: nil,
68
+ fetched: feed_fetched
69
+ }
70
+ feed_rec.update_attributes!( feed_attribs )
71
+
72
+ ## add log error activity -- in future add to error log - better - why? why not?
73
+ Activity.create!( text: "*** error: fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}" )
74
+
75
+ return nil # sorry; no feed for parsing available
76
+ end
77
+
78
+ puts "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
79
+
80
+ feed_xml = response.body
81
+ ###
82
+ # NB: Net::HTTP will NOT set encoding UTF-8 etc.
83
+ # will mostly be ASCII-8BIT (binary)
84
+ # - try to change encoding to UTF-8 ourselves
85
+ logger.debug "feed_xml.encoding.name (before): #{feed_xml.encoding.name}"
86
+
87
+
88
+ #####
89
+ # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
90
+
91
+ # try converting ASCII-8BIT to UTF-8 based on domain-specific guesses
92
+ begin
93
+ # Try it as UTF-8 directly
94
+ # Note: make a copy/dup - otherwise convert fails (because string is already changed/corrupted)
95
+ feed_xml_cleaned = feed_xml.dup.force_encoding( Encoding::UTF_8 )
96
+ unless feed_xml_cleaned.valid_encoding?
97
+
98
+ puts "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1"
99
+ Activity.create!( text: "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1" )
100
+ # Some of it might be old Windows code page
101
+ # -- (note: Windows Code Page CP1252 is close to but NOT identical to ISO_8859_1 / Latin-1; they differ in bytes 0x80-0x9F - check ??)
102
+
103
+ # tell ruby the encoding
104
+ # encode to utf-8
105
+ ## use all-in-one encode call instead ?? e.g. feed_xml_cleaned = feed_xml.encode( Encoding::UTF_8, Encoding::ISO_8859_1 )
106
+ feed_xml_cleaned = feed_xml.dup.force_encoding( Encoding::ISO_8859_1 ).encode( Encoding::UTF_8 )
107
+ end
108
+ feed_xml = feed_xml_cleaned
109
+ rescue EncodingError => e
110
+ puts "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e.to_s}"
111
+ Activity.create!( text: "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e.to_s}" )
112
+
113
+ # Force it to UTF-8, throwing out invalid bits
114
+ ## todo: check options - add ?? or something to mark invalid chars ???
115
+ feed_xml.encode!( Encoding::UTF_8, :invalid => :replace, :undef => :replace )
116
+ end
117
+
118
+ ## NB:
119
+ # for now "hardcoded" to utf8 - what else can we do?
120
+ # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
121
+ ### old "simple" version
122
+ ## feed_xml = feed_xml.force_encoding( Encoding::UTF_8 )
123
+
124
+
125
+ logger.debug "feed_xml.encoding.name (after): #{feed_xml.encoding.name}"
126
+
127
+ ## check for md5 hash for response.body
128
+
129
+ last_feed_md5 = feed_rec.md5
130
+ feed_md5 = Digest::MD5.hexdigest( feed_xml )
131
+
132
+ if last_feed_md5 && last_feed_md5 == feed_md5
133
+ # not all servers handle conditional gets, so while not much can be
134
+ # done about the bandwidth, if the response body is identical
135
+ # the downstream processing (parsing, caching, ...) can be avoided.
136
+ # - thanks to planet mars -fido.rb for the idea, cheers.
137
+
138
+ puts "no change; md5 digests match; skipping parsing feed"
139
+ return nil # no updates available; nothing to do
140
+ end
141
+
142
+ feed_attribs = {
143
+ http_code: response.code.to_i,
144
+ http_server: response.header[ 'server' ],
145
+ http_etag: response.header[ 'etag' ],
146
+ http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
147
+ body: feed_xml,
148
+ md5: feed_md5,
149
+ fetched: feed_fetched
150
+ }
151
+
152
+ ## if debug?
153
+ puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
154
+ puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
155
+ puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
156
+ ## end
157
+
158
+ ### note: might crash w/ encoding errors when saving in postgres
159
+ ## e.g. PG::CharacterNotInRepertoire: ERROR: ...
160
+ ## catch error, log it and stop for now
161
+ #
162
+ # in the future check for different charset than utf-8 ?? possible?? how to deal with non-utf8 charsets??
163
+
164
+ begin
165
+ feed_rec.update_attributes!( feed_attribs )
166
+ rescue Exception => e
167
+ # log db error; and continue
168
+ puts "*** error: updating feed database record '#{feed_key}' - #{e.to_s}"
169
+ Activity.create!( text: "*** error: updating feed database record '#{feed_key}' - #{e.to_s}" )
170
+ return nil # sorry; corrupt feed; parsing not possible; fix char encoding - make it an option in config??
171
+ end
172
+
173
+
174
+ logger.debug "feed_xml:"
175
+ logger.debug feed_xml[ 0..300 ] # get first 300 chars
176
+
177
+ puts "Before parsing feed >#{feed_key}<..."
178
+
179
+ ### move to feedutils
180
+ ### logger.debug "using stdlib RSS::VERSION #{RSS::VERSION}"
181
+
182
+
183
+ ### todo/fix:
184
+ ### return feed_xml !!! - move FeedUtils::Parser.parse to update or something !!!
185
+
186
+ feed_xml
187
+ ## fix/todo: check for feed.nil? -> error parsing!!!
188
+ # or throw exception
189
+ ## feed = FeedUtils::Parser.parse( feed_xml )
190
+ ## feed
191
+ end
192
+
193
+
194
+ end # class FeedFetcherCondGetWithCache
195
+
196
+ end # module Pluto
@@ -0,0 +1,24 @@
1
+ # encoding: utf-8
2
+
3
+ module PlutoFeedFetcher
4
+
5
+ MAJOR = 0
6
+ MINOR = 1
7
+ PATCH = 0
8
+ VERSION = [MAJOR,MINOR,PATCH].join('.')
9
+
10
+ def self.version
11
+ VERSION
12
+ end
13
+
14
+ def self.banner
15
+ ### todo: add RUBY_PATCHLEVEL e.g. -p124
16
+ "pluto-feedfetcher/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
17
+ end
18
+
19
+ def self.root
20
+ "#{File.expand_path( File.dirname(File.dirname(File.dirname(File.dirname(__FILE__)))) )}"
21
+ end
22
+
23
+ end # module PlutoFeedFetcher
24
+
@@ -0,0 +1,20 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'pluto/models'
5
+
6
+
7
+ # more 3rd party gems
8
+ require 'fetcher' # fetch (text) documents
9
+
10
+
11
+ # our own code
12
+ require 'pluto/feedfetcher/version' # Note: let version always go first
13
+ require 'pluto/feedfetcher/basic'
14
+ require 'pluto/feedfetcher/cond_get_with_cache'
15
+
16
+
17
+
18
+ # say hello
19
+ puts PlutoFeedFetcher.banner if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)
20
+
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pluto-feedfetcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Gerald Bauer
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pluto-models
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 1.3.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 1.3.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: fetcher
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.4.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.4.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: rdoc
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '4.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '4.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: hoe
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.13'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.13'
69
+ description: pluto-feedfetcher - fetch web feeds (w/ conditional HTTP get e.g. use
70
+ etags, if-modified-since etc.)
71
+ email: feedreader@googlegroups.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files:
75
+ - HISTORY.md
76
+ - Manifest.txt
77
+ - README.md
78
+ files:
79
+ - HISTORY.md
80
+ - Manifest.txt
81
+ - README.md
82
+ - Rakefile
83
+ - lib/pluto/feedfetcher.rb
84
+ - lib/pluto/feedfetcher/basic.rb
85
+ - lib/pluto/feedfetcher/cond_get_with_cache.rb
86
+ - lib/pluto/feedfetcher/version.rb
87
+ homepage: https://github.com/feedreader/pluto.feed.fetcher
88
+ licenses:
89
+ - Public Domain
90
+ metadata: {}
91
+ post_install_message:
92
+ rdoc_options:
93
+ - "--main"
94
+ - README.md
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: 1.9.2
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.4.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: pluto-feedfetcher - fetch web feeds (w/ conditional HTTP get e.g. use etags,
113
+ if-modified-since etc.)
114
+ test_files: []