opml_janitor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 739ca7e9b2c084b9bcbb7d141f4d852ff908eb12
4
+ data.tar.gz: 12fea1d3defa5bb56ae7c37ee5d2240ee68e8020
5
+ SHA512:
6
+ metadata.gz: 9d9d7b67ea243c90f896ec75ee442c395c02ff6d26a6615f912a7ce581a2b2f3a4f761337f7a39233c7c6816cc576f2892ad46ab8d25fb27a545f9b0e70369e0
7
+ data.tar.gz: 201af0d908a7375fcf390da31b519c82434147ade3f4e16ea331a7c5305d0978819e7df168a801b9c2813b77c30cd4b015b1197f6aed9aa1bca8c0c1161f39ce
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
source 'https://rubygems.org'
# Fetched over HTTPS: the unauthenticated git:// transport can be tampered
# with in transit and is no longer served by GitHub.
gem 'opml_saw', :git => "https://github.com/feedbin/opml_saw.git", :branch => "master"
# Specify your gem's dependencies in opml_janitor.gemspec
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 chrislee35
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,62 @@
1
+ # OpmlJanitor
2
+
3
+ Tool to clean up broken and stale RSS feeds from an OPML file.
4
+ It parses the XML, and for each feed, it downloads the RSS/Atom/etc., validates that the feed has been active within the given time frame, and writes the result to a new OPML XML document containing only the good feeds.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'opml_janitor'
11
+
12
+ And then execute:
13
+
14
+ $ bundle
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install opml_janitor
19
+
20
+ ## Usage
21
+
22
+ require 'opml_janitor'
23
+
24
+ opml_janitor = OpmlJanitor::Parser.from_file("example.opml")
25
+ opml_janitor.debug = true
26
+ opml_janitor.threads = 20
27
+ opml_janitor.validate!(Time.now - (30*24*60*60))
28
+ result = opml_janitor.to_xml
29
+
30
+
31
+ Or you could use the bundled tool
32
+
33
+ $ opml_janitor -h
34
+ Usage: opml_janitor [-hv] [-t <# threads>] [-s <date>] [-i <input file>] [-o <output file>]
35
+ -h displays this help text
36
+ -v turns on debugging messages (encouraged)
37
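+ -t # set the number of threads (default 1) (highly encouraged)
+ -s <date> only keep feeds with a post newer than <date> (default: keep every feed that downloads and parses)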
38
+ -i specify a file for input (default is standard input)
39
+ -o specify a file for output (default is standard output)
40
+
41
+
42
+ Example:
43
+
44
+ $ opml_janitor -i subscriptions.xml -o sub.xml -s "2015-01-01" -v -t 20
45
+
46
+ * Reads subscriptions from subscriptions.xml
47
+ * Writes results to sub.xml
48
+ * Must have posts newer than 2015-01-01 00:00:00 (local time zone)
49
+ * Verbose messages
50
+ * 20 threads
51
+
52
+ ## Related
53
+
54
+ Related work: <a href='https://github.com/feedbin/opml_saw'>opml_saw</a>
55
+
56
+ ## Contributing
57
+
58
+ 1. Fork it ( https://github.com/chrislee35/opml_janitor/fork )
59
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
60
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
61
+ 4. Push to the branch (`git push origin my-new-feature`)
62
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rake/testtask'
5
+ require 'rdoc/task'
6
+
7
+ Rake::TestTask.new do |t|
8
+ t.libs << 'lib'
9
+ t.test_files = FileList['test/test_*.rb']
10
+ t.verbose = true
11
+ end
12
+
13
+ RDoc::Task.new do |rd|
14
+ rd.main = "README.doc"
15
+ rd.rdoc_files.include("README.md", "lib/**/*.rb")
16
+ rd.options << "--all"
17
+ rd.options << "--verbose"
18
+ end
19
+
20
+ task :default => :test
data/bin/opml_janitor ADDED
@@ -0,0 +1,59 @@
1
#!/usr/bin/env ruby
# Command-line front end for OpmlJanitor::Parser: reads an OPML document,
# validates every feed in it and writes the OPML that is left afterwards.
require 'opml_janitor'
require 'getoptlong'
require 'time'

opts = GetoptLong.new(
  [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
  [ '--debug', '-v', GetoptLong::NO_ARGUMENT ],
  [ '--since', '-s', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--input', '-i', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--output', '-o', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--threads', '-t', GetoptLong::REQUIRED_ARGUMENT ]
)

# :input and :output hold a file name when one was given on the command
# line; nil means "use the standard stream". The files are only opened once
# the work is about to happen, so parsing the options never truncates the
# output file.
options = {
  :help => false,
  :debug => false,
  :threads => 1,
  :since => nil,
  :input => nil,
  :output => nil
}

opts.each do |opt, arg|
  case opt
  when '--help'
    options[:help] = true
  when '--debug'
    options[:debug] = true
  when '--since'
    options[:since] = Time.parse(arg)
  when '--input'
    options[:input] = arg
  when '--output'
    options[:output] = arg
  when '--threads'
    # String#to_i turns garbage into 0, and zero workers would validate
    # nothing at all, so always keep at least one.
    options[:threads] = [arg.to_i, 1].max
  else
    options[:help] = true
  end
end

if options[:help]
  puts "Usage: #{$0} [-hv] [-t <# threads>] [-s <date>] [-i <input file>] [-o <output file>]"
  puts " -h displays this help text"
  puts " -v turns on debugging messages (encouraged)"
  puts " -t # set the number of threads (default 1) (highly encouraged)"
  puts " -s <date> only keep feeds with a post newer than <date> (default: no staleness check)"
  puts " -i specify a file for input (default is standard input)"
  puts " -o specify a file for output (default is standard output)"
  exit
end

source = options[:input] ? File.open(options[:input], 'r') : $stdin
begin
  opml_janitor = OpmlJanitor::Parser.from_filehandle(source)
ensure
  # Only close what this script opened; $stdin is left alone.
  source.close if options[:input]
end

opml_janitor.debug = options[:debug]
opml_janitor.threads = options[:threads]
opml_janitor.validate!(options[:since])
result = opml_janitor.to_xml

if options[:output]
  File.open(options[:output], 'w') { |sink| sink.puts(result) }
else
  $stdout.puts(result)
end
59
+
@@ -0,0 +1,177 @@
1
+ require "opml_janitor/version"
2
+ require "opml_janitor/outline"
3
+ require 'nokogiri'
4
+ require 'open-uri'
5
+ require 'rss'
6
+ require 'timeout'
7
+ require 'thread'
8
+ require 'pp'
9
+
10
+ module OpmlJanitor # :nodoc:
11
+ # The Parser class takes in the contents of an OPML XML document and
12
+ # can filter and save the results
13
+ class Parser
14
+ # initialize takes the contents of an OPML XML document and a flag
15
+ # for debug messages (default false)
16
+ def initialize(contents, debug = false)
17
+ @xml = contents
18
+ @opml = Nokogiri::XML.parse(@xml)
19
+ @debug = debug
20
+ @threads = 1
21
+ @timeout = 20
22
+ end
23
+
24
+ # debug= sets the debug flag
25
+ def debug=(debug)
26
+ @debug = debug
27
+ end
28
+
29
+ # timeout= sets the timeout for downloading and processing each feed
30
+ # the default is 20 seconds
31
+ def timeout=(timeout)
32
+ @timeout = timeout
33
+ end
34
+
35
+ ##
36
+ # threads= sets the number of threads for running the validation process
37
+ def threads=(threads)
38
+ @threads = threads
39
+ end
40
+
41
+ ##
42
+ # from_filehandle allows the OPML XML document to be read from a given
43
+ # filehandle and returns an initialized Parser instance
44
+ def self.from_filehandle(filehandle)
45
+ contents = filehandle.read
46
+ Parser.new(contents)
47
+ end
48
+
49
+ ##
50
+ # from_file takes in a filename and returns an initialized Parser instance
51
+ def self.from_file(filename)
52
+ from_filehandle(File.open(filename, 'r'))
53
+ end
54
+
55
+ ##
56
+ # from_url takes in a URL as a string and returns an initialized Parser instance
57
+ def self.from_url(url)
58
+ from_filehandle(open(url).read)
59
+ end
60
+
61
+ ##
62
+ # validate! takes in one argument, +since+, specifing a Time object. Since is used to check if any posts have been posted since that time, thus detecting "stale" blogs/rss feeds
63
+ def validate!(since = nil)
64
+ # this threading methodology is highly expensive for simple blocks, but a life-saver for IO-bound blocks
65
+ @work_queue = Queue.new
66
+ data = @opml.css("body").children
67
+ boss = Thread.new do
68
+ filter!(data)
69
+ end
70
+
71
+ workers = (0...@threads).map do
72
+ @work_queue.push(false) # this will end each thread
73
+ Thread.new do
74
+ begin
75
+ while work = @work_queue.pop()
76
+ val = validate_callback(work[:outline], since)
77
+ spaces = 80 - work[:outline][:xml_url].length
78
+ spaces = 1 if spaces < 1
79
+ puts "#{work[:outline][:xml_url]}#{' ' * spaces}#{val}" if @debug
80
+ unless val == "PASS"
81
+ work[:node].unlink
82
+ end
83
+ end
84
+ rescue ThreadError
85
+ end
86
+ end
87
+ end
88
+ boss.join
89
+ workers.map(&:join)
90
+ end
91
+
92
+ ##
93
+ # to_xml outputs the current OPML XML structure as a String containing all the XML markup
94
+ def to_xml
95
+ @opml.to_xml
96
+ end
97
+
98
+ private
99
+
100
+ ##
101
+ # filter! recurses down the OPML body, looking for outline tags, and pushes each
102
+ # leaf node onto a work queue
103
+ def filter!(data)
104
+ data.each do |node|
105
+ if node.name == 'outline'
106
+ outline = Outline.new(node).to_hash
107
+ if node.children.length > 0
108
+ title = outline[:title] || outline[:text]
109
+ filter!(node.children)
110
+ else
111
+ @work_queue.push({ :outline => outline, :node => node})
112
+ end
113
+ end
114
+ end
115
+ end
116
+
117
+ ##
118
+ # validate_callback tries to download a feed and verify that it has been updated
119
+ # since the +since+ time
120
+ def validate_callback(feed, since=nil)
121
+ val = "FAIL"
122
+ begin
123
+ Timeout::timeout(@timeout) {
124
+ open(feed[:xml_url]) do |rss|
125
+ feed = RSS::Parser.parse(rss)
126
+ if feed
127
+ last_updated = Time.at(0)
128
+ feed.items.each do |item|
129
+ #p item.class
130
+ updated = nil
131
+ if item.respond_to?(:updated)
132
+ updated = item.updated.content
133
+ elsif item.respond_to?(:date)
134
+ updated = item.date
135
+ end
136
+ next unless updated
137
+ if updated and updated > last_updated
138
+ last_updated = updated
139
+ end
140
+ end
141
+ if since
142
+ #p last_updated
143
+ if last_updated and last_updated > since
144
+ val = "PASS"
145
+ else
146
+ val = "STALE"
147
+ end
148
+ else
149
+ val = "PASS"
150
+ end
151
+ else
152
+ val = "NOFEED"
153
+ end
154
+ end
155
+ }
156
+ rescue EOFError => e
157
+ val = "EOFError"
158
+ rescue OpenURI::HTTPError => e
159
+ val = "HTTPError"
160
+ rescue RSS::Error => e
161
+ val = "RSSError"
162
+ rescue Timeout::Error => e
163
+ val = "Timedout"
164
+ rescue SocketError => e
165
+ val = "SocketError"
166
+ rescue RuntimeError => e
167
+ val = "Redirect Loop"
168
+ rescue Errno::ECONNREFUSED => e
169
+ val = "Connection Refused"
170
+ rescue Exception => e
171
+ val = "Unexpected error: #{e}"
172
+ end
173
+ val
174
+ end
175
+ end
176
+
177
+ end
@@ -0,0 +1,35 @@
1
module OpmlJanitor

  # Outline wraps one OPML <outline> node and exposes its attributes as a
  # Hash keyed by underscored symbols (e.g. "xmlUrl" becomes :xml_url).
  class Outline

    # The attribute hash; empty until to_hash has been called.
    # NOTE: this reader replaces Object#hash for Outline instances.
    attr_reader :hash

    # node must respond to +attributes+ (enumerable of name/value pairs)
    # and to +attr(name)+.
    def initialize(node)
      @node = node
      @hash = {}
    end

    # Copies every attribute of the node into the hash and returns it.
    def to_hash
      @node.attributes.each do |name, _attribute|
        @hash[underscore(name).to_sym] = @node.attr(name)
      end
      @hash
    end

    private

    # from ActiveSupport: turns "CamelCase" / "dashed-name" into
    # "camel_case" / "dashed_name", and "::" into "/".
    def underscore(camel_cased_word)
      camel_cased_word.to_s
        .gsub(/::/, '/')
        .gsub(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2')
        .gsub(/([a-z\d])([A-Z])/, '\1_\2')
        .tr("-", "_")
        .downcase
    end

  end

end
@@ -0,0 +1,3 @@
1
module OpmlJanitor
  # Release version of the gem. Frozen so the shared constant cannot be
  # modified in place by code that reads it.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,25 @@
1
# coding: utf-8
# Gem specification for opml_janitor. The version number is read from
# lib/opml_janitor/version.rb, so lib/ is put on the load path first.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'opml_janitor/version'

Gem::Specification.new do |gem|
  gem.name          = "opml_janitor"
  gem.version       = OpmlJanitor::VERSION
  gem.authors       = ["chrislee35"]
  gem.email         = ["rubygems@chrislee.dhs.org"]
  gem.summary       = %q{Parses an OPML file, verifies the feeds, and writes the resulting OPML}
  gem.description   = %q{This gem provides a tool for cleaning up OPML feeds.}
  gem.homepage      = "https://github.com/chrislee35/opml_janitor"
  gem.license       = "MIT"

  # Package exactly what git tracks; executables and tests are picked out
  # of that list by directory.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_runtime_dependency "nokogiri", "~> 1.6"
  gem.add_development_dependency "minitest", "~> 5.5"
  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"
end
data/test/helper.rb ADDED
@@ -0,0 +1,6 @@
1
# Shared test setup: loads minitest and the library under test.
require 'minitest/autorun'
require 'minitest/test'
require 'minitest/unit'
require 'minitest/pride'
# Spelled Minitest, the same constant test_opml_janitor.rb uses for
# Minitest::Test; MiniTest is only a legacy alias of it.
include Minitest::Assertions
require File.expand_path('../../lib/opml_janitor.rb', __FILE__)
@@ -0,0 +1,27 @@
1
# Fallback for Rubies whose Kernel has no require_relative.
unless Kernel.respond_to?(:require_relative)
  module Kernel
    def require_relative(path)
      require File.join(File.dirname(caller[0]), path.to_str)
    end
  end
end

require_relative 'helper'
require 'pp'

# Tests for OpmlJanitor::Parser. The fixture files are looked up next to
# this file, so the suite does not depend on the current directory.
class TestOPMLJanitor < Minitest::Test
  def test_opml_parse
    #opmljanitor = OpmlJanitor::Parser.from_file("test/test.opml")
  end

  def test_opml_validation
    #opmljanitor = OpmlJanitor::Parser.from_file("test/test2.opml")
    #pp opmljanitor.validate
  end

  # Validates the feeds of the fixture against a cut-off 265 days in the
  # past and checks that an XML document comes back out.
  def test_opml_validation_with_time
    fixture = File.expand_path('../test2.opml', __FILE__)
    skip "fixture #{fixture} is not available" unless File.exist?(fixture)

    opmljanitor = OpmlJanitor::Parser.from_file(fixture)
    opmljanitor.validate!(Time.now - (265*24*60*60))
    xml = opmljanitor.to_xml

    assert_kind_of String, xml
    refute_empty xml
  end
end
+ end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: opml_janitor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - chrislee35
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '5.5'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '5.5'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.7'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.7'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ description: This gem provides a tool for cleaning up OPML feeds.
70
+ email:
71
+ - rubygems@chrislee.dhs.org
72
+ executables:
73
+ - opml_janitor
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - bin/opml_janitor
83
+ - lib/opml_janitor.rb
84
+ - lib/opml_janitor/outline.rb
85
+ - lib/opml_janitor/version.rb
86
+ - opml_janitor.gemspec
87
+ - test/helper.rb
88
+ - test/test_opml_janitor.rb
89
+ homepage: https://github.com/chrislee35/opml_janitor
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.2.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Parses an OPML file, verifies the feeds, and writes the resulting OPML
113
+ test_files:
114
+ - test/helper.rb
115
+ - test/test_opml_janitor.rb