pluto-feedfetcher 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8f4eed6539db8183401c2da47b91e42ecc59aaa3
4
+ data.tar.gz: 4cbf9edd2c859a5b6b643e901789b3c84e0dce9d
5
+ SHA512:
6
+ metadata.gz: 493a4e41cdc0628aab781058f442a316d254a358925893e3f17db4e1be2221472b4199351e485390cb4844e0b14b9e495f42746bd9bd43324c7f2bbff9b4bdd5
7
+ data.tar.gz: d199ce23f2b41e4cd140c4b23e82193bd1fa7adcf8f92e2ccd02e325865b514ad3fd3cea9ba2c952c8ecd1fc90ebe1b1a7d823ec1189d6dec532440b36aec1ff
data/HISTORY.md ADDED
@@ -0,0 +1,5 @@
1
+
2
+ ### 0.1.0 / 2015-01-11
3
+
4
+ * Everything is new. First release.
5
+
data/Manifest.txt ADDED
@@ -0,0 +1,8 @@
1
+ HISTORY.md
2
+ Manifest.txt
3
+ README.md
4
+ Rakefile
5
+ lib/pluto/feedfetcher.rb
6
+ lib/pluto/feedfetcher/basic.rb
7
+ lib/pluto/feedfetcher/cond_get_with_cache.rb
8
+ lib/pluto/feedfetcher/version.rb
data/README.md ADDED
@@ -0,0 +1,25 @@
1
+ # pluto-feedfetcher gem - fetch web feeds (w/ conditional HTTP get e.g. use etags, if-modified-since etc.)
2
+
3
+ * home :: [github.com/feedreader/pluto.feed.fetcher](https://github.com/feedreader/pluto.feed.fetcher)
4
+ * bugs :: [github.com/feedreader/pluto.feed.fetcher/issues](https://github.com/feedreader/pluto.feed.fetcher/issues)
5
+ * gem :: [rubygems.org/gems/pluto-feedfetcher](https://rubygems.org/gems/pluto-feedfetcher)
6
+ * rdoc :: [rubydoc.info/gems/pluto-feedfetcher](http://rubydoc.info/gems/pluto-feedfetcher)
7
+ * forum :: [groups.google.com/group/feedreader](http://groups.google.com/group/feedreader)
8
+
9
+
10
+
11
+ ## Usage
12
+
13
+ TBD
14
+
15
+
16
+ ## License
17
+
18
+ The `pluto-feedfetcher` scripts are dedicated to the public domain.
19
+ Use them as you please with no restrictions whatsoever.
20
+
21
+ ## Questions? Comments?
22
+
23
+ Send them along to the [Planet Pluto and Friends Forum/Mailing List](http://groups.google.com/group/feedreader).
24
+ Thanks!
25
+
data/Rakefile ADDED
@@ -0,0 +1,31 @@
require 'hoe'
require './lib/pluto/feedfetcher/version.rb'

# Gem specification (via Hoe) for the pluto-feedfetcher gem.
Hoe.spec 'pluto-feedfetcher' do

  # one-liner used for both the summary and the description
  tagline = "pluto-feedfetcher - fetch web feeds (w/ conditional HTTP get e.g. use etags, if-modified-since etc.)"

  self.version      = PlutoFeedFetcher::VERSION

  self.summary      = tagline
  self.description  = tagline

  self.urls         = %w[https://github.com/feedreader/pluto.feed.fetcher]

  self.author       = 'Gerald Bauer'
  self.email        = 'feedreader@googlegroups.com'

  # point hoe at the markdown (.md) readme and history files
  #  - the extension is what gets them formatted on github
  self.readme_file  = 'README.md'
  self.history_file = 'HISTORY.md'

  # runtime dependencies
  self.extra_deps   = [
    ['pluto-models', '>= 1.3.2'],
    ['fetcher',      '>= 0.4.4']
  ]

  self.licenses     = ['Public Domain']

  self.spec_extras  = { :required_ruby_version => '>= 1.9.2' }

end
@@ -0,0 +1,93 @@
# encoding: utf-8

module Pluto

  # Simple/basic feed fetcher; use for debugging (only/mostly).
  #  - downloads the feed and hands back the xml text (tagged as utf-8)
  #  - Note: will NOT change db records in any way
  class FeedFetcherBasic

    include LogUtils::Logging
    include Models   # for easy convenience access for Activity etc.

    # debug flag (writer); if on, fetch also saves a copy of the feed to disk
    attr_writer :debug

    def initialize
      @worker = Fetcher::Worker.new
    end

    # debug flag (reader); off (false) if never set
    def debug?
      @debug || false
    end


    # Fetch the feed for the passed-in feed record
    #  (reads feed_rec.feed_url and feed_rec.key only).
    #
    # Returns the feed xml text as is (that is, unparsed).
    def fetch( feed_rec )
      source_url = feed_rec.feed_url
      feed_key   = feed_rec.key

      text = fix_me_fetch_utf8( source_url )

      logger.debug "feed_xml:"
      logger.debug text[ 0..500 ]   # get first 500 chars

      #### todo/fix: make it generic - move out of this method (re)use - for all fetcher??
      if debug?   # also write a copy to disk
        logger.debug "saving feed to >./#{feed_key}.xml<..."
        File.open( "./#{feed_key}.xml", 'w' ) { |file| file.write( text ) }
      end

      ### todo/fix:
      ###   parsing (FeedUtils::Parser.parse) is NOT done here; the caller gets the xml text
      puts "Before parsing feed >#{feed_key}<..."

      text
    end


    ###########
    # todo/fix: use "standard" fetch method e.g. Fetcher.read_utf8!() - clean up/remove (duplicate) here??
    private

    # Get url and return the response body tagged as utf-8;
    #  prints the http status and some response headers along the way.
    def fix_me_fetch_utf8( url )
      res = @worker.get( url )

      puts "http status #{res.code} #{res.message}"

      %w[server etag last-modified].each do |name|
        value = res.header[ name ]
        puts "http header - #{name}: #{value} - #{value.class.name}"
      end

      body = res.body

      ###
      # NB: Net::HTTP will NOT set encoding UTF-8 etc.
      #   will mostly be ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
      logger.debug "xml.encoding.name (before): #{body.encoding.name}"

      ## NB:
      #  for now "hardcoded" to utf8 - what else can we do?
      #  - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
      utf8 = body.force_encoding( Encoding::UTF_8 )
      logger.debug "xml.encoding.name (after): #{utf8.encoding.name}"

      utf8
    end

  end # class FeedFetcherBasic

end # module Pluto
@@ -0,0 +1,196 @@
# encoding: utf-8

module Pluto

  # Feed fetcher using a conditional HTTP GET (etag / last-modified headers
  # stored in the feed record) plus an md5 digest check of the response body,
  # so that unchanged feeds can be skipped.
  #  - Note: will update the passed-in feed record (http headers, body, md5, fetched)
  #          and log warnings/errors as Activity records.
  class FeedFetcherCondGetWithCache

    include LogUtils::Logging
    include Models   # for easy convenience access for Activity etc.

    def initialize
      @worker = Fetcher::Worker.new
    end

    def debug=(value)  @debug = value;   end
    def debug?()       @debug || false;  end


    # Try a smart http update for feed_rec.
    #
    # Returns the feed xml (utf-8 text, unparsed) if a new/changed feed got
    # fetched and stored in feed_rec; returns nil if there is nothing to parse:
    #  - SocketError on fetching (e.g. unknown domain name)
    #  - http status 304 (not modified) or any status other than 200
    #  - md5 digest of the body matches feed_rec.md5
    #  - updating the feed record failed
    def fetch( feed_rec )
      feed_url = feed_rec.feed_url
      feed_key = feed_rec.key

      ### todo/fix: normalize/unifiy feed_url
      ##  - same in fetcher - use shared utitlity method or similar

      @worker.use_cache = true
      begin
        @worker.cache[ feed_url ] = {
          'etag'          => feed_rec.http_etag,
          'last-modified' => feed_rec.http_last_modified
        }
        response = @worker.get( feed_url )
      rescue SocketError => e
        ## catch socket error for unknown domain names (e.g. pragdave.blogs.pragprog.com)
        ###  will result in SocketError -- getaddrinfo: Name or service not known
        log_activity( "*** error: fetching feed '#{feed_key}' - #{e.to_s}" )

        ### todo/fix: update feed rec in db
        return nil
      ensure
        ## always switch off the cache again - also if get raises an error other than SocketError
        @worker.use_cache = false   # fix/todo: restore old use_cache setting instead of false
      end

      if response.code == '304'  # not modified (conditional GET - e.g. using etag/last-modified)
        puts "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
        puts "no change; request returns not modified (304); skipping parsing feed"
        return nil   # no updates available; nothing to do
      end

      feed_fetched = Time.now

      if response.code != '200'   # note Net::HTTP response.code is a string in ruby
        error_text = "*** error: fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
        puts error_text

        ## reset the cached headers/body; keep status code, server and fetched timestamp
        feed_attribs = {
          http_code:          response.code.to_i,
          http_server:        response.header[ 'server' ],
          http_etag:          nil,
          http_last_modified: nil,
          body:               nil,
          md5:                nil,
          fetched:            feed_fetched
        }
        feed_rec.update_attributes!( feed_attribs )

        ## add log error activity -- in future add to error log - better - why? why not?
        Activity.create!( text: error_text )

        return nil  #  sorry; no feed for parsing available
      end

      puts "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"

      feed_xml = response.body
      ###
      # NB: Net::HTTP will NOT set encoding UTF-8 etc.
      #   will mostly be ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
      #   - try to change encoding to UTF-8 ourselves
      logger.debug "feed_xml.encoding.name (before): #{feed_xml.encoding.name}"

      feed_xml = convert_to_utf8( feed_xml, feed_key )

      logger.debug "feed_xml.encoding.name (after): #{feed_xml.encoding.name}"

      ## check for md5 hash for response.body
      last_feed_md5 = feed_rec.md5
      feed_md5      = Digest::MD5.hexdigest( feed_xml )

      if last_feed_md5 && last_feed_md5 == feed_md5
        # not all servers handle conditional gets, so while not much can be
        # done about the bandwidth, but if the response body is identical
        # the downstream processing (parsing, caching, ...) can be avoided.
        #  - thanks to planet mars -fido.rb for the idea, cheers.

        puts "no change; md5 digests match; skipping parsing feed"
        return nil   # no updates available; nothing to do
      end

      feed_attribs = {
        http_code:          response.code.to_i,
        http_server:        response.header[ 'server' ],
        http_etag:          response.header[ 'etag' ],
        http_last_modified: response.header[ 'last-modified' ], ## note: last_modified header gets stored as plain text (not datetime)
        body:               feed_xml,
        md5:                feed_md5,
        fetched:            feed_fetched
      }

      ## if debug?
        puts "http header - server: #{response.header['server']} - #{response.header['server'].class.name}"
        puts "http header - etag: #{response.header['etag']} - #{response.header['etag'].class.name}"
        puts "http header - last-modified: #{response.header['last-modified']} - #{response.header['last-modified'].class.name}"
      ## end

      ### note: might crash w/ encoding errors when saving in postgress
      ##  e.g. PG::CharacterNotInRepertoire: ERROR: ...
      ##    catch error, log it and stop for now
      #
      #  in the future check for different charset than utf-8 ?? possible?? how to deal with non-utf8 charsets??

      begin
        feed_rec.update_attributes!( feed_attribs )
      rescue StandardError => e
        # note: rescue StandardError (not Exception)
        #   - signals/interrupts, exit and out-of-memory errors must NOT get swallowed here
        # log db error; and continue
        log_activity( "*** error: updating feed database record '#{feed_key}' - #{e.to_s}" )
        return nil  #  sorry; corrupt feed; parsing not possible; fix char encoding - make it an option in config??
      end

      logger.debug "feed_xml:"
      logger.debug feed_xml[ 0..300 ] # get first 300 chars

      puts "Before parsing feed >#{feed_key}<..."

      ### todo/fix:
      ###   parsing (FeedUtils::Parser.parse) is NOT done here; the caller gets the xml text
      feed_xml
    end


    private

    # Print text to the console and record it as an Activity.
    def log_activity( text )
      puts text
      Activity.create!( text: text )
    end

    # Convert the raw (binary) response body to utf-8 based on domain-specific guesses:
    #  1) try it as utf-8 directly
    #  2) if the bytes are not valid utf-8 - assume latin1 and transcode to utf-8
    #  3) on an encoding error - force it to utf-8, throwing out invalid bits
    #
    # Returns the utf-8 string (a copy for 1 and 2; the string passed in, changed in place, for 3).
    def convert_to_utf8( feed_xml, feed_key )
      # Note: make a copy/dup - otherwise convert fails (because string is already changed/corrupted)
      cleaned = feed_xml.dup.force_encoding( Encoding::UTF_8 )

      unless cleaned.valid_encoding?
        log_activity( "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1" )
        # Some of it might be old Windows code page
        #  -- (Windows Code Page CP1252 is ISO_8859_1 is Latin-1 - check ??)

        # tell ruby the encoding; encode to utf-8
        cleaned = feed_xml.dup.force_encoding( Encoding::ISO_8859_1 ).encode( Encoding::UTF_8 )
      end

      cleaned
    rescue EncodingError => e
      log_activity( "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e.to_s}" )

      # Force it to UTF-8, throwing out invalid bits
      ## todo: check options - add ?? or something to mark invalid chars ???
      feed_xml.encode!( Encoding::UTF_8, :invalid => :replace, :undef => :replace )
    end

  end # class FeedFetcherCondGetWithCache

end # module Pluto
@@ -0,0 +1,24 @@
# encoding: utf-8

# Version info (and some helpers) for the pluto-feedfetcher gem.
module PlutoFeedFetcher

  MAJOR = 0
  MINOR = 1
  PATCH = 0
  VERSION = [MAJOR,MINOR,PATCH].join('.')

  # Returns the gem version as a string e.g. "0.1.0".
  def self.version
    VERSION
  end

  # Returns a one-line banner - the gem version plus the ruby version,
  # release date and platform.
  def self.banner
    ### todo: add RUBY_PATCHLEVEL or RUBY_PATCH_LEVEL  e.g. -p124
    "pluto-feedfetcher/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
  end

  # Returns the absolute path of the folder four levels up from this file
  # (for lib/pluto/feedfetcher/version.rb that is the gem's root folder).
  #
  # Note: the path gets expanded BEFORE walking up; walking up an unexpanded,
  #   short relative __FILE__ (e.g. "version.rb") with File.dirname gets stuck
  #   at "." and would end up with the current working folder.
  def self.root
    File.expand_path( '../../../..', __FILE__ )
  end

end # module PlutoFeedFetcher
@@ -0,0 +1,20 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'pluto/models'
5
+
6
+
7
+ # more 3rd party gems
8
+ require 'fetcher' # fetch (text) documents
9
+
10
+
11
+ # our own code
12
+ require 'pluto/feedfetcher/version' # Note: let version always go first
13
+ require 'pluto/feedfetcher/basic'
14
+ require 'pluto/feedfetcher/cond_get_with_cache'
15
+
16
+
17
+
18
+ # say hello
19
+ puts PlutoFeedFetcher.banner if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)
20
+
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pluto-feedfetcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Gerald Bauer
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pluto-models
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 1.3.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 1.3.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: fetcher
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.4.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.4.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: rdoc
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '4.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '4.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: hoe
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.13'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.13'
69
+ description: pluto-feedfetcher - fetch web feeds (w/ conditional HTTP get e.g. use
70
+ etags, if-modified-since etc.)
71
+ email: feedreader@googlegroups.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files:
75
+ - HISTORY.md
76
+ - Manifest.txt
77
+ - README.md
78
+ files:
79
+ - HISTORY.md
80
+ - Manifest.txt
81
+ - README.md
82
+ - Rakefile
83
+ - lib/pluto/feedfetcher.rb
84
+ - lib/pluto/feedfetcher/basic.rb
85
+ - lib/pluto/feedfetcher/cond_get_with_cache.rb
86
+ - lib/pluto/feedfetcher/version.rb
87
+ homepage: https://github.com/feedreader/pluto.feed.fetcher
88
+ licenses:
89
+ - Public Domain
90
+ metadata: {}
91
+ post_install_message:
92
+ rdoc_options:
93
+ - "--main"
94
+ - README.md
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: 1.9.2
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.4.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: pluto-feedfetcher - fetch web feeds (w/ conditional HTTP get e.g. use etags,
113
+ if-modified-since etc.)
114
+ test_files: []