opml_janitor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 739ca7e9b2c084b9bcbb7d141f4d852ff908eb12
4
+ data.tar.gz: 12fea1d3defa5bb56ae7c37ee5d2240ee68e8020
5
+ SHA512:
6
+ metadata.gz: 9d9d7b67ea243c90f896ec75ee442c395c02ff6d26a6615f912a7ce581a2b2f3a4f761337f7a39233c7c6816cc576f2892ad46ab8d25fb27a545f9b0e70369e0
7
+ data.tar.gz: 201af0d908a7375fcf390da31b519c82434147ade3f4e16ea331a7c5305d0978819e7df168a801b9c2813b77c30cd4b015b1197f6aed9aa1bca8c0c1161f39ce
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
source 'https://rubygems.org'

# opml_saw is pulled straight from its GitHub repository. It is fetched over
# HTTPS instead of the unauthenticated, unencrypted git:// protocol so the
# download cannot be tampered with in transit.
gem 'opml_saw', :git => "https://github.com/feedbin/opml_saw.git", :branch => "master"

# Specify your gem's dependencies in opml_janitor.gemspec
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 chrislee35
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,62 @@
1
+ # OpmlJanitor
2
+
3
+ Tool to clean up broken and stale RSS feeds from an OPML file.
4
+ It parses the XML, and for each feed, it downloads the RSS/Atom/etc., validates that the feed has been active within the given time frame, and writes the result to a new OPML XML document containing only the good feeds.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'opml_janitor'
11
+
12
+ And then execute:
13
+
14
+ $ bundle
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install opml_janitor
19
+
20
+ ## Usage
21
+
22
+ require 'opml_janitor'
23
+
24
+ opml_janitor = OpmlJanitor::Parser.from_file("example.opml")
25
+ opml_janitor.debug = true
26
+ opml_janitor.threads = 20
27
+ opml_janitor.validate!(Time.now - (30*24*60*60))
28
+ result = opml_janitor.to_xml
29
+
30
+
31
+ Or you could use the bundled tool
32
+
33
+ $ opml_janitor -h
34
+ Usage: opml_janitor [-hv] [-t <# threads>] [-s <date>] [-i <input file>] [-o <output file>]
35
+ -h displays this help text
36
+ -v turns on debugging messages (encouraged)
37
+ -t # set the number of threads (default 1) (highly encouraged)
38
+ -i specify a file for input (default is standard input)
39
+ -o specify a file for output (default is standard output)
40
+
41
+
42
+ Example:
43
+
44
+ $ opml_janitor -i subscriptions.xml -o sub.xml -s "2015-01-01" -v -t 20
45
+
46
+ * Reads subscriptions from subscriptions.xml
47
+ * Writes results to sub.xml
48
+ * Must have posts newer than 2015-01-01 00:00:00 (local time zone)
49
+ * Verbose messages
50
+ * 20 threads
51
+
52
+ ## Related
53
+
54
+ Related work: <a href='https://github.com/feedbin/opml_saw'>opml_saw</a>
55
+
56
+ ## Contributing
57
+
58
+ 1. Fork it ( https://github.com/chrislee35/opml_janitor/fork )
59
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
60
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
61
+ 4. Push to the branch (`git push origin my-new-feature`)
62
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rake/testtask'
5
+ require 'rdoc/task'
6
+
7
+ Rake::TestTask.new do |t|
8
+ t.libs << 'lib'
9
+ t.test_files = FileList['test/test_*.rb']
10
+ t.verbose = true
11
+ end
12
+
13
+ RDoc::Task.new do |rd|
14
+ rd.main = "README.doc"
15
+ rd.rdoc_files.include("README.md", "lib/**/*.rb")
16
+ rd.options << "--all"
17
+ rd.options << "--verbose"
18
+ end
19
+
20
+ task :default => :test
data/bin/opml_janitor ADDED
@@ -0,0 +1,59 @@
1
#!/usr/bin/env ruby
# Command-line front end for OpmlJanitor: reads an OPML document, validates
# every feed in it and writes the surviving feeds back out as OPML.
require 'opml_janitor'
require 'getoptlong'
require 'time'

opts = GetoptLong.new(
  [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
  [ '--debug', '-v', GetoptLong::NO_ARGUMENT ],
  [ '--since', '-s', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--input', '-i', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--output', '-o', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--threads', '-t', GetoptLong::REQUIRED_ARGUMENT ]
)

# :input and :output hold file names; nil means standard input / standard
# output. Files are opened only when they are actually needed so that, for
# example, `-o file -h` does not truncate the output file.
options = {
  :help => false,
  :debug => false,
  :threads => 1,
  :since => nil,
  :input => nil,
  :output => nil
}

opts.each do |opt, arg|
  case opt
  when '--help'
    options[:help] = true
  when '--debug'
    options[:debug] = true
  when '--since'
    options[:since] = Time.parse(arg)
  when '--input'
    options[:input] = arg
  when '--output'
    options[:output] = arg
  when '--threads'
    # A non-numeric or non-positive value would leave no worker at all, so
    # fall back to a single thread.
    options[:threads] = [arg.to_i, 1].max
  else
    options[:help] = true
  end
end

if options[:help]
  puts "Usage: #{$0} [-hv] [-t <# threads>] [-s <date>] [-i <input file>] [-o <output file>]"
  puts " -h displays this help text"
  puts " -v turns on debugging messages (encouraged)"
  puts " -t # set the number of threads (default 1) (highly encouraged)"
  puts " -s only keep feeds with a post newer than this date (default is no age check)"
  puts " -i specify a file for input (default is standard input)"
  puts " -o specify a file for output (default is standard output)"
  exit
end

# Read the whole document first; the block form closes the input file.
opml_janitor =
  if options[:input]
    File.open(options[:input], 'r') { |fh| OpmlJanitor::Parser.from_filehandle(fh) }
  else
    OpmlJanitor::Parser.from_filehandle($stdin)
  end

opml_janitor.debug = options[:debug]
opml_janitor.threads = options[:threads]
opml_janitor.validate!(options[:since])
result = opml_janitor.to_xml

# The output file is only created/truncated once there is a result to write.
if options[:output]
  File.open(options[:output], 'w') { |fh| fh.puts(result) }
else
  $stdout.puts(result)
end
59
+
@@ -0,0 +1,177 @@
1
+ require "opml_janitor/version"
2
+ require "opml_janitor/outline"
3
+ require 'nokogiri'
4
+ require 'open-uri'
5
+ require 'rss'
6
+ require 'timeout'
7
+ require 'thread'
8
+ require 'pp'
9
+
10
module OpmlJanitor # :nodoc:
  # The Parser class takes in the contents of an OPML XML document and
  # can filter and save the results
  class Parser
    # debug= sets the debug flag; when set, validate! prints one status line
    # per feed
    attr_writer :debug

    # timeout= sets the timeout (in seconds) for downloading and processing
    # each feed; the default is 20 seconds
    attr_writer :timeout

    # threads= sets the number of threads for running the validation process
    attr_writer :threads

    # initialize takes the contents of an OPML XML document and a flag
    # for debug messages (default false)
    def initialize(contents, debug = false)
      @xml = contents
      @opml = Nokogiri::XML.parse(@xml)
      @debug = debug
      @threads = 1
      @timeout = 20
    end

    ##
    # from_filehandle allows the OPML XML document to be read from a given
    # filehandle (anything responding to +read+) and returns an initialized
    # Parser instance. The filehandle is not closed.
    def self.from_filehandle(filehandle)
      new(filehandle.read)
    end

    ##
    # from_file takes in a filename and returns an initialized Parser instance.
    # The file is closed again once its contents have been read.
    def self.from_file(filename)
      File.open(filename, 'r') { |fh| from_filehandle(fh) }
    end

    ##
    # from_url takes in a URL as a string and returns an initialized Parser instance.
    # The document is fetched with open-uri.
    def self.from_url(url)
      # Hand the IO itself to from_filehandle; it performs the read.
      URI.parse(url).open { |io| from_filehandle(io) }
    end

    ##
    # validate! takes in one argument, +since+, specifying a Time object. Since
    # is used to check if any posts have been posted since that time, thus
    # detecting "stale" blogs/rss feeds. With no +since+, a feed only has to
    # download and parse. Every outline whose feed does not pass is removed
    # from the document. Returns the parser itself.
    def validate!(since = nil)
      # this threading methodology is highly expensive for simple blocks, but a life-saver for IO-bound blocks
      @work_queue = Queue.new
      rejected = Queue.new
      # Zero or a negative thread count would leave the queue unserviced.
      worker_count = [@threads.to_i, 1].max

      workers = Array.new(worker_count) do
        Thread.new do
          # A false entry is the stop marker for one worker.
          while (work = @work_queue.pop)
            val = validate_callback(work[:outline], since)
            if @debug
              url = work[:outline][:xml_url].to_s # outlines without xmlUrl have no URL
              spaces = [80 - url.length, 1].max
              puts "#{url}#{' ' * spaces}#{val}"
            end
            # Only record the verdict here; the document is modified later,
            # from a single thread.
            rejected.push(work[:node]) unless val == "PASS"
          end
        end
      end

      begin
        filter!(@opml.css("body").children)
      ensure
        # Stop markers go in after all real work so that no worker can leave
        # while feeds are still waiting; the ensure keeps the workers from
        # blocking forever should the traversal raise.
        worker_count.times { @work_queue.push(false) }
      end
      workers.each(&:join)

      # All workers are done and the traversal is over: safe to edit the tree.
      rejected.pop.unlink until rejected.empty?
      self
    end

    ##
    # to_xml outputs the current OPML XML structure as a String containing all the XML markup
    def to_xml
      @opml.to_xml
    end

    private

    ##
    # filter! recurses down the OPML body, looking for outline tags, and pushes each
    # leaf node onto a work queue
    def filter!(data)
      data.each do |node|
        next unless node.name == 'outline'
        if node.children.empty?
          @work_queue.push({ :outline => Outline.new(node).to_hash, :node => node })
        else
          filter!(node.children)
        end
      end
    end

    ##
    # validate_callback tries to download a feed and verify that it has been updated
    # since the +since+ time. +feed+ is the outline's attribute hash; its
    # :xml_url entry is the address to fetch. Returns "PASS" for a good feed
    # and a short failure label ("STALE", "NOFEED", "HTTPError", ...) otherwise.
    def validate_callback(feed, since = nil)
      Timeout.timeout(@timeout) do
        # The URL comes from the OPML file, i.e. from untrusted input. Going
        # through URI keeps it from ever being treated as a local path or a
        # "|command" pipe, which Kernel#open would do.
        URI.parse(feed[:xml_url].to_s).open do |rss|
          feed_status(RSS::Parser.parse(rss), since)
        end
      end
    rescue EOFError
      "EOFError"
    rescue OpenURI::HTTPError
      "HTTPError"
    rescue RSS::Error
      "RSSError"
    rescue Timeout::Error
      "Timedout"
    rescue SocketError
      "SocketError"
    rescue RuntimeError
      "Redirect Loop"
    rescue Errno::ECONNREFUSED
      "Connection Refused"
    rescue StandardError => e
      # StandardError rather than Exception: interrupts and interpreter-level
      # failures must not be reported as a broken feed.
      "Unexpected error: #{e}"
    end

    ##
    # feed_status turns a parsed feed (or nil when nothing could be parsed)
    # into "NOFEED", "PASS" or "STALE".
    def feed_status(parsed, since)
      return "NOFEED" unless parsed
      return "PASS" unless since
      newest_item_time(parsed) > since ? "PASS" : "STALE"
    end

    ##
    # newest_item_time returns the most recent timestamp among the feed's
    # items, or the epoch when no item carries one.
    def newest_item_time(parsed)
      parsed.items.inject(Time.at(0)) do |newest, item|
        stamp = item_timestamp(item)
        stamp && stamp > newest ? stamp : newest
      end
    end

    ##
    # item_timestamp reads the item's +updated+ content when the item has an
    # +updated+ accessor, otherwise its +date+; nil when neither is available.
    def item_timestamp(item)
      if item.respond_to?(:updated)
        updated = item.updated
        # an entry may lack the element altogether
        updated && updated.content
      elsif item.respond_to?(:date)
        item.date
      end
    end
  end

end
@@ -0,0 +1,35 @@
1
module OpmlJanitor

  # Outline wraps one XML node and exposes the node's attributes as a Hash
  # keyed by underscored symbols (for example xmlUrl becomes :xml_url).
  class Outline

    # The attribute Hash; it stays empty until to_hash has been called.
    attr_reader :hash

    # +node+ must respond to +attributes+ (name => attribute pairs) and
    # +attr(name)+.
    def initialize(node)
      @hash = {}
      @node = node
    end

    # Copies every attribute of the node into the Hash and returns that Hash.
    def to_hash
      @node.attributes.each do |name, _attribute|
        @hash[underscore(name).to_sym] = @node.attr(name)
      end
      @hash
    end

    private

    # from ActiveSupport: turns a CamelCase / dashed word into snake_case
    # without modifying the argument.
    def underscore(camel_cased_word)
      camel_cased_word.to_s.
        gsub(/::/, '/').
        gsub(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2').
        gsub(/([a-z\d])([A-Z])/, '\1_\2').
        tr("-", "_").
        downcase
    end

  end

end
@@ -0,0 +1,3 @@
1
module OpmlJanitor
  # Version of the opml_janitor gem; frozen so that the shared constant
  # cannot be modified in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,25 @@
1
# coding: utf-8
# Gem specification for opml_janitor. The version number is read from
# lib/opml_janitor/version.rb, so lib/ is put on the load path first.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'opml_janitor/version'

Gem::Specification.new do |spec|
  # Identity and descriptive metadata
  spec.name = "opml_janitor"
  spec.version = OpmlJanitor::VERSION
  spec.authors = ["chrislee35"]
  spec.email = ["rubygems@chrislee.dhs.org"]
  spec.summary = %q{Parses an OPML file, verifies the feeds, and writes the resulting OPML}
  spec.description = %q{This gem provides a tool for cleaning up OPML feeds.}
  spec.homepage = "https://github.com/chrislee35/opml_janitor"
  spec.license = "MIT"

  # Package contents: everything tracked by git; executables and test files
  # are picked out of that list by directory.
  spec.files = `git ls-files -z`.split("\x0")
  spec.executables = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_runtime_dependency "nokogiri", "~> 1.6"
  spec.add_development_dependency "minitest", "~> 5.5"
  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
end
data/test/helper.rb ADDED
@@ -0,0 +1,6 @@
1
# Shared test setup: loads Minitest and the library under test.
require 'minitest/autorun'
require 'minitest/test'
require 'minitest/unit'
require 'minitest/pride'
# Use the Minitest constant, matching the test file's Minitest::Test; the
# MiniTest spelling is only a legacy alias.
include Minitest::Assertions
require File.expand_path('../../lib/opml_janitor.rb', __FILE__)
@@ -0,0 +1,27 @@
1
# Fallback for interpreters whose Kernel does not provide require_relative:
# resolve the path against the directory of the calling file.
if !Kernel.respond_to?(:require_relative)
  module Kernel
    def require_relative(path)
      calling_dir = File.dirname(caller[0])
      require File.join(calling_dir, path.to_str)
    end
  end
end
8
+
9
+ require_relative 'helper'
10
+ require 'pp'
11
+
12
# Tests for OpmlJanitor::Parser. The OPML fixtures under test/ are not part
# of the packaged file list, so the fixture-based test skips instead of
# erroring when the fixture is absent.
class TestOPMLJanitor < Minitest::Test
  # Placeholder: parsing test is currently disabled.
  def test_opml_parse
    #opmljanitor = OpmlJanitor::Parser.from_file("test/test.opml")
  end

  # Placeholder: validation without a cut-off time is currently disabled.
  def test_opml_validation
    #opmljanitor = OpmlJanitor::Parser.from_file("test/test2.opml")
    #pp opmljanitor.validate
  end

  # Validates the fixture's feeds against a cut-off of 265 days ago and
  # checks that the result can still be serialized. Downloads the feeds, so
  # it needs network access.
  def test_opml_validation_with_time
    skip "fixture test/test2.opml is not available" unless File.exist?("test/test2.opml")
    opmljanitor = OpmlJanitor::Parser.from_file("test/test2.opml")
    opmljanitor.validate!(Time.now - (265*24*60*60))
    assert_kind_of String, opmljanitor.to_xml
  end
end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: opml_janitor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - chrislee35
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '5.5'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '5.5'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.7'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.7'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ description: This gem provides a tool for cleaning up OPML feeds.
70
+ email:
71
+ - rubygems@chrislee.dhs.org
72
+ executables:
73
+ - opml_janitor
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - bin/opml_janitor
83
+ - lib/opml_janitor.rb
84
+ - lib/opml_janitor/outline.rb
85
+ - lib/opml_janitor/version.rb
86
+ - opml_janitor.gemspec
87
+ - test/helper.rb
88
+ - test/test_opml_janitor.rb
89
+ homepage: https://github.com/chrislee35/opml_janitor
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.2.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Parses an OPML file, verifies the feeds, and writes the resulting OPML
113
+ test_files:
114
+ - test/helper.rb
115
+ - test/test_opml_janitor.rb