harvester 0.8.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/CHANGELOG.rdoc +45 -0
  2. data/README.rdoc +74 -0
  3. data/Rakefile +28 -0
  4. data/bin/harvester +13 -0
  5. data/bin/harvester-chart +5 -0
  6. data/bin/harvester-clock +35 -0
  7. data/bin/harvester-db +15 -0
  8. data/bin/harvester-fetch +5 -0
  9. data/bin/harvester-generate +5 -0
  10. data/bin/harvester-jabber +6 -0
  11. data/bin/harvester-new +25 -0
  12. data/bin/harvester-post +5 -0
  13. data/bin/harvester-run +14 -0
  14. data/collections.yaml +15 -0
  15. data/config.yaml +13 -0
  16. data/data/ent/HTMLlat1.ent +194 -0
  17. data/data/ent/HTMLspecial.ent +77 -0
  18. data/data/ent/HTMLsymbol.ent +241 -0
  19. data/data/sql/dbd-mysql-isotime.diff +11 -0
  20. data/data/sql/harvester-0.6-mysql.diff +59 -0
  21. data/data/sql/harvester-0.7-mysql.diff +39 -0
  22. data/data/sql/mysql/chart.sql +1 -0
  23. data/data/sql/mysql/create.table.enclosures.sql +9 -0
  24. data/data/sql/mysql/create.table.items.sql +8 -0
  25. data/data/sql/mysql/create.table.jabbersettings.sql +5 -0
  26. data/data/sql/mysql/create.table.jabbersubscriptions.sql +5 -0
  27. data/data/sql/mysql/create.table.sources.sql +9 -0
  28. data/data/sql/mysql/create.view.last48hours.sql +1 -0
  29. data/data/sql/postgresql/chart.sql +1 -0
  30. data/data/sql/postgresql/create.table.enclosures.sql +9 -0
  31. data/data/sql/postgresql/create.table.items.sql +8 -0
  32. data/data/sql/postgresql/create.table.jabbersettings.sql +5 -0
  33. data/data/sql/postgresql/create.table.jabbersubscriptions.sql +5 -0
  34. data/data/sql/postgresql/create.table.sources.sql +9 -0
  35. data/data/sql/postgresql/create.view.last48hours.sql +1 -0
  36. data/data/sql/sqlite3/chart.sql +1 -0
  37. data/data/sql/sqlite3/create.table.enclosures.sql +9 -0
  38. data/data/sql/sqlite3/create.table.items.sql +8 -0
  39. data/data/sql/sqlite3/create.table.jabbersettings.sql +5 -0
  40. data/data/sql/sqlite3/create.table.jabbersubscriptions.sql +5 -0
  41. data/data/sql/sqlite3/create.table.sources.sql +9 -0
  42. data/data/sql/sqlite3/create.view.last48hours.sql +1 -0
  43. data/data/templates/atom-all.xml +88 -0
  44. data/data/templates/atom.xml +88 -0
  45. data/data/templates/index.html +412 -0
  46. data/data/templates/rss-all.rdf +86 -0
  47. data/data/templates/rss.rdf +85 -0
  48. data/data/templates/static/harvester.css +365 -0
  49. data/data/templates/static/harvester.gif +0 -0
  50. data/data/templates/static/harvester_ie7.css +15 -0
  51. data/data/templates/static/harvester_lte_ie6.css +27 -0
  52. data/harvester.gemspec +35 -0
  53. data/lib/harvester.rb +132 -0
  54. data/lib/harvester/chart.rb +72 -0
  55. data/lib/harvester/db.rb +123 -0
  56. data/lib/harvester/fetch.rb +96 -0
  57. data/lib/harvester/generate.rb +152 -0
  58. data/lib/harvester/generator/entity_translator.rb +46 -0
  59. data/lib/harvester/generator/link_absolutizer.rb +39 -0
  60. data/lib/harvester/jabber.rb +443 -0
  61. data/lib/harvester/mrss.rb +355 -0
  62. data/lib/harvester/post.rb +19 -0
  63. metadata +237 -0
Binary file
@@ -0,0 +1,15 @@
1
+ /*
2
+ * harvester_ie7.css
3
+ *
4
+ * Description: Anpassungen des Defaultstylesheets 'harvester.css'
5
+ für IE7
6
+ *
7
+ */
8
+
9
+ /* ------------------------------------------------------ */
10
+ /* Blogeintraege */
11
+ /* ------------------------------------------------------ */
12
+
13
+ .entry-content pre, .entry-content code {
14
+ overflow: scroll;
15
+ }
@@ -0,0 +1,27 @@
1
+ /*
2
+ * harvester_lte_ie6.css
3
+ *
4
+ * Description: Anpassungen des Defaultstylesheets 'harvester.css'
5
+ für IE6 und ältere Versionen
6
+ *
7
+ */
8
+
9
+ /* ------------------------------------------------------ */
10
+ /* Blogeintraege */
11
+ /* ------------------------------------------------------ */
12
+
13
+ .entry {
14
+ width: 100%;
15
+ /*width: 98%;*/ /* dirty hack */
16
+ }
17
+
18
+ .entry-content pre,
19
+ .entry-content code {
20
+ width: 90%; /* dirty hack */
21
+ overflow: scroll;
22
+ }
23
+
24
+ .tickers3 .ticker ul li {
25
+ padding-bottom: 1em;
26
+ width: 100%;
27
+ }
data/harvester.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # encoding: utf-8
2
+ require 'rubygems' unless defined? Gem
3
+ require File.dirname(__FILE__) + "/lib/harvester"
4
+
5
# Gem specification for the harvester feed aggregator.
# The version number is taken from Harvester::VERSION.
Gem::Specification.new do |spec|
  spec.name        = "harvester"
  spec.version     = Harvester::VERSION
  spec.authors     = ["astro", "Neingeist", "Tigion", "Josef Spillner", "Jan Lelis"]
  spec.email       = "mail@janlelis.de"
  spec.homepage    = "https://github.com/janlelis/harvester"
  spec.summary     = "Web-based feed aggregator"
  spec.description = "The harvester collects your favourite feeds and generates static html/feed pages"

  spec.required_ruby_version     = ">= 1.9.2"
  spec.required_rubygems_version = ">= 1.3.6"

  # Runtime dependencies, grouped by the harvester feature that uses them.
  # (jabber would need 'xmpp4r', which is intentionally not declared.)
  {
    'main'     => %w[rdbi rdbi-driver-sqlite3 logger-colors],
    'fetch'    => %w[eventmachine em-http-request],
    'generate' => %w[ruby-xslt hpricot],
    'chart'    => %w[rmagick gruff],
    'clock'    => %w[clockwork],
  }.each_value do |gem_names|
    gem_names.each { |gem_name| spec.add_dependency gem_name }
  end

  # Packaged files: library code, executables, docs, data and yaml configs.
  packaged_globs   = %w|lib/**/*.rb bin/* [A-Z]*.{txt,rdoc} data/**/* *.yaml|
  spec.files       = Dir.glob(packaged_globs) + %w|Rakefile harvester.gemspec|
  spec.executables = Dir['bin/*'].map { |path| File.basename(path) }
  spec.license     = "AGPL"
end
data/lib/harvester.rb ADDED
@@ -0,0 +1,132 @@
1
+ require 'rubygems' unless defined? Gem
2
+ require 'rdbi'
3
+ require 'yaml'
4
+ require 'logger/colors'
5
+
6
# Core harvester object: loads the YAML configuration and the feed
# collections, sets up the logger and opens the database connection.
# Other files (harvester/db.rb, harvester/fetch.rb, harvester/chart.rb, ...)
# reopen this class to add the individual tasks.
class Harvester
  VERSION = '0.8.0.pre.1'

  attr_reader :config, :settings, :collections, :dbi, :logger

  # Creates a new Harvester instance.
  #
  # options - Hash with String keys. 'config' is the path of the YAML config
  #           file (default './config.yaml'); every other entry overrides the
  #           corresponding value of the config file's 'settings' section.
  #
  # Raises LoadError if the config file or the collections file is missing.
  # Errors raised while loading the database driver or connecting are logged
  # and re-raised unchanged.
  def initialize(options = {})
    # work on a copy so the caller's hash is left untouched
    options = options.dup

    # load config
    config_path = File.expand_path(options.delete('config') || './config.yaml')
    begin
      @config = YAML.load_file(config_path)
    rescue Errno::ENOENT
      raise LoadError, "Could not find a yaml config file at #{ config_path }"
    end

    # defaults < config file settings < explicit options
    @settings = {
      'collections' => 'collections.yaml',
      'timeout'     => 90,
      'size limit'  => 200_000,
      'log_level'   => Logger::DEBUG, # 0
      'log_file'    => STDOUT,
    }
    @settings.merge!(@config['settings'] || {}) # tolerate a config without a settings section
    @settings.merge!(options)

    # load collections
    begin
      @collections = YAML.load_file(@settings['collections'])
    rescue Errno::ENOENT
      raise LoadError, "Could not find a yaml collections file at #{ File.expand_path(@settings['collections']) }"
    end

    # init logger: the target may be an IO object (the default), the literal
    # names "STDOUT"/"STDERR" (e.g. from the command line or the config file)
    # or a file path; a missing value falls back to STDOUT
    log_target = @settings['log_file']
    if log_target.is_a?(String) && log_target =~ /\ASTD(?:OUT|ERR)\z/
      log_target = Object.const_get(log_target)
    end
    @logger = Logger.new(log_target || STDOUT)
    @logger.formatter = proc { |_level, _datetime, _appname, msg| "#{msg}\n" }

    # the level is either a severity name (debug..fatal) or a number
    level = @settings['log_level'].to_s
    @logger.level = if %w[debug info warn error fatal].include?(level.downcase)
      Logger::Severity.const_get(level.upcase)
    else
      level.to_i
    end

    # connect to db
    begin
      require 'rdbi/driver/' + config['db']['driver'].downcase # FIXME?

      @dbi = RDBI::connect config['db']['driver'],
        database: config['db']['database'],
        user:     config['db']['user'],
        password: config['db']['password']
    rescue StandardError, LoadError # LoadError: the driver library is not installed
      error 'Something is wrong with your database settings:'
      raise
    end
  end

  # Creates a new harvester using the command-line options (ARGV) to
  # configure it. Recognised switches are removed from ARGV while parsing.
  def self.new_from_argv
    options = {}

    require 'optparse'
    OptionParser.new do |op|
      op.banner = %q{USAGE:
harvester <COMMAND> [OPTIONS]
COMMANDS:
run run a complete harvester update
fetch run only the fetch script
generate run only the generate script
chart run only the generate chart script
post run only the post processing script
db start a database task (create or maintenance)
clock start the scheduler (cron replacement)
new create a new harvester project
jabber start the jabber bot (not implemented yet)
OPTIONS:} # automatically added as --help
      op.on('-v', '--version') do
        puts Harvester::VERSION
        exit
      end
      op.on('-c', '--config FILE') do |config|
        options['config'] = config
      end
      op.on('-l', '--log_file FILE') do |log_file|
        options['log_file'] = log_file
      end
      op.on('-L', '--log_level NUMBER') do |log_level|
        options['log_level'] = log_level
      end
      op.on('-p', '--post_script FILE') do |post_script|
        options['post_script'] = post_script
      end
      op.on('-m', '--no-maintenance') do
        options['no-maintenance'] = true
      end
      op.on('-s', '--no-chart') do
        options['no-chart'] = true
      end
    end.parse!

    Harvester.new options
  end

  protected

  # logger helpers (note: #warn replaces Kernel#warn inside this class)
  def debug(msg) @logger.debug(msg) end
  def info(msg)  @logger.info(msg)  end
  def warn(msg)  @logger.warn(msg)  end
  def error(msg) @logger.error(msg) end
  def fatal(msg) @logger.fatal(msg) end

  # Logs an info message before and after running the given block.
  # The "[done ]" line is only written if the block returns normally.
  def task(msg) # MAYBE: nested spaces+behaviour
    info "[start] " + msg
    yield
    info "[done ] " + msg
  end
end
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative '../harvester'
4
+ require_relative '../harvester/db'
5
+ require 'gruff'
6
+
7
class Harvester
  module CHART; end

  # Builds the fetch statistics image: runs the driver-specific chart query,
  # counts the returned rows per collection and day of month, and writes the
  # resulting line chart to chart.jpg in the configured output directory.
  def chart!
    info "CHART"
    task "generate chart" do
      stats = Chart::StatsPerCollection.new
      rows  = @dbi.execute(File.read(sql_query(:chart)))
      rows.each do |date, collection|
        stats.add_one(collection, Date.parse(date).day)
      end

      target = File.join(@config['settings']['output'], '/chart.jpg')
      Chart.new(stats).write(target)
    end
  end
end
22
+
23
+ # generates a fetch statistics image using gruff
24
# Line chart of harvested items per day, rendered with gruff.
class Harvester::Chart
  # stats   - object that yields (name, values) pairs from #each and exposes
  #           the charted days via #days (see StatsPerCollection).
  # options - currently unused.
  def initialize(stats, options = {}) # TODO configure g with options
    @g = Gruff::Line.new(300)
    @g.title        = "Harvested items per day"
    @g.x_axis_label = "Days"
    @g.y_axis_label = "Items"

    # one data series per collection
    stats.each { |name, values| @g.data(name, values) }

    # x axis labels: position index => day
    @g.labels = Hash[stats.days.each_with_index.map { |day, position| [position, day.to_s] }]
  end

  # Renders the chart to the file at +path+.
  def write(path)
    @g.write(path)
  end
end
44
+
45
# Counts items per collection and day; days are kept in the order in which
# they were first seen.
class Harvester::Chart::StatsPerCollection
  attr_reader :days

  def initialize
    @collections = {}
    @days = []
  end

  # Registers one item for +collection+ on +day+. A nil collection is
  # counted under '(unknown)'. Returns the per-day counter hash of that
  # collection.
  def add_one(collection, day)
    @days.push(day) unless @days.include?(day)
    name = collection || '(unknown)' # TODO research

    per_day = @collections[name] || {}
    per_day[day] = (per_day[day] || 0) + 1
    @collections[name] = per_day
  end

  # Yields each collection name together with its item counts, one count per
  # known day (0 for days without items), in the order of #days.
  def each
    @collections.each do |name, per_day|
      yield name, @days.map { |day| per_day[day].to_i }
    end
  end
end
@@ -0,0 +1,123 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative '../harvester'
4
+
5
# Database related tasks of the harvester: schema creation, purging of
# sources that were removed from the collections file, and storing of
# fetched feeds.
class Harvester
  module DB; end

  # Creates the required database structure by executing every
  # create*.sql file of the configured driver inside one transaction.
  def create!
    task "create database tables" do
      @dbi.transaction do
        sql_queries(:create).each do |sql|
          info "* execute " + File.basename(sql)
          @dbi.execute File.read(sql)
        end
      end
    end
  end

  # Checks for feed source changes: removes sources that are stored in the
  # database but no longer listed in their collection, and deletes the items
  # of feeds that are not referenced by any collection anymore.
  def maintenance!
    task "look for sources to purge" do
      stale = []
      @dbi.execute("SELECT collection, rss FROM sources").each do |db_collection, db_rss|
        stale << [db_collection, db_rss] unless (@collections[db_collection] || []).include?(db_rss)
      end

      stale.each do |collection, rss|
        info "* remove #{collection}:#{rss}..."
        @dbi.execute "DELETE FROM sources WHERE collection=? AND rss=?", collection, rss
      end

      # A feed may have been dropped from several collections at once; look
      # at each URL only once. Items are kept while any collection still
      # lists the feed.
      stale.map { |_, rss| rss }.uniq.each do |rss|
        holder = @collections.find { |_, urls| urls && urls.include?(rss) }
        if holder
          warn "* must keep #{rss} because it's still in #{holder.first}"
        else
          info "* purge items from feed #{rss}"
          @dbi.execute "DELETE FROM items WHERE rss=?", rss
        end
      end
    end
  end

  private

  # Parses a fetched feed and stores its source, items and enclosures.
  #
  # rss_url      - URL the feed was fetched from (key of the source).
  # new_source   - true if the source is not in the database yet.
  # collection   - name of the collection the feed belongs to.
  # response     - fetched feed; passed to MRSS.parse and queried with
  #                ['Last-Modified'].
  # rss_url_nice - prefix for log messages.
  #
  # NOTE(review): the fetcher passes the HTTP body here; if that is a plain
  # String, response['Last-Modified'] is a substring lookup and not a header
  # access - confirm where the header value should come from.
  def update(rss_url, new_source, collection, response, rss_url_nice = rss_url)
    rss = MRSS.parse(response)
    feed_link = rss.link.to_s

    @dbi.transaction do
      # update source
      if new_source
        @dbi.execute "INSERT INTO sources (collection, rss, last, title, link, description) VALUES (?, ?, ?, ?, ?, ?)",
          collection, rss_url, response['Last-Modified'], rss.title, rss.link, rss.description
        info rss_url_nice + "Added as source"
      else
        @dbi.execute "UPDATE sources SET last=?, title=?, link=?, description=? WHERE collection=? AND rss=?",
          response['Last-Modified'], rss.title, rss.link, rss.description, collection, rss_url
        debug rss_url_nice + "Source updated"
      end

      # update items
      items_new, items_updated = 0, 0
      rss.items.each do |item|
        description = item.description

        # Link mangling: resolve the item link against the feed's link, or
        # against the feed URL itself when the feed declares no link
        begin
          link_base = feed_link.empty? ? rss_url.to_s : feed_link
          link = URI.join(link_base, (item.link || rss.link).to_s).to_s
        rescue URI::Error
          link = item.link
        end

        # Push into database
        db_title, = *@dbi.execute("SELECT title FROM items WHERE rss=? AND link=?", rss_url, link).fetch

        if db_title.nil? || db_title.empty? # item is new
          @dbi.execute "INSERT INTO items (rss, title, link, date, description) VALUES (?, ?, ?, ?, ?)",
            rss_url, item.title, link, item.date.to_s, description
          items_new += 1
        else
          @dbi.execute "UPDATE items SET title=?, description=? WHERE rss=? AND link=?",
            item.title, description, rss_url, link
          items_updated += 1
        end

        # Remove all enclosures
        @dbi.execute "DELETE FROM enclosures WHERE rss=? AND link=?", rss_url, link

        # Re-add all enclosures
        item.enclosures.each do |enclosure|
          # same fallback as for item links: an unparsable href is stored
          # as-is instead of aborting the whole transaction
          begin
            href_base = feed_link.empty? ? link.to_s : feed_link
            href = URI.join(href_base, enclosure['href'].to_s).to_s
          rescue URI::Error
            href = enclosure['href']
          end

          length = enclosure['length']
          length = 0 if length.nil? || length.empty?

          @dbi.execute "INSERT INTO enclosures (rss, link, href, mime, title, length) VALUES (?, ?, ?, ?, ?, ?)",
            rss_url, link, href, enclosure['type'], enclosure['title'], length
        end
      end
      info rss_url_nice + "#{ items_new } new items, #{ items_updated } updated"
    end
  end

  # Returns an enumerator over the paths of all sql files of the configured
  # driver whose name starts with +task+. The paths are sorted so that the
  # execution order does not depend on the file system.
  def sql_queries(task)
    Dir[ File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }*.sql" ].sort.each
  end

  # Returns the path of the sql file for +task+ of the configured driver.
  def sql_query(task)
    File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }.sql"
  end
end
@@ -0,0 +1,96 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative '../harvester'
4
+ require_relative 'db'
5
+ require_relative 'mrss'
6
+
7
+ require 'eventmachine'
8
+ require 'em-http'
9
+ require 'uri'
10
+
11
+ class Harvester
12
+ module FETCH; end
13
+
14
+ # fetches new feed updates and store them in the database
15
+ def fetch!
16
+ info "FETCH"
17
+ maintenance! unless @settings['no-maintenance']
18
+ Fetcher.run @dbi, @collections, @settings, @logger do |*args| update(*args) end # results will be passed to the update function
19
+ end
20
+ end
21
+
22
+ # fetches new feed updates and store them in the database using Eventmachine
23
# Fetches new feed updates using Eventmachine and hands every successful
# response to the block given to .run.
module Harvester::Fetcher
  # Requests all feeds of all collections concurrently.
  #
  # dbi         - database handle; used to look up the stored source row
  #               (rss, last) of each feed.
  # collections - Hash of collection name => list of feed URLs (nil, empty
  #               and single-String values are tolerated).
  # settings    - Hash; 'size limit' (maximum response size) and 'timeout'
  #               (seconds until the whole run is aborted) are read.
  # logger      - Logger receiving progress and error messages.
  #
  # Yields rss_url, new_source, collection, response body and the log prefix
  # for every response with status 200 that is within the size limit.
  # Returns when all requests have finished or the timeout has elapsed.
  def self.run(dbi, collections, settings, logger)
    logger.info '[start] fetch using Eventmachine'

    # prepare logger: pad all urls to the width of the longest one
    max_url_size = collections.inject(0) { |widest, (_, rss_urls)| # log display hack
      [widest, *Array(rss_urls).map(&:size)].max
    }
    log_prefix = lambda { |rss_url| '* ' + rss_url.ljust(max_url_size) + ' | ' }

    #dbi['AutoCommit'] = false # TODO check for rdbi

    EventMachine.run do
      # One entry per outstanding request. Entries are [collection, url]
      # pairs and are removed one at a time, so a url that is listed in
      # several collections does not stop the reactor too early.
      pending = []
      finish = lambda { |job|
        position = pending.index(job)
        pending.delete_at(position) if position
        EM.stop if pending.empty?
      }

      collections.each { |collection, rss_urls|
        Array(rss_urls).each { |rss_url|
          # prepare log prefix
          rss_url_nice = log_prefix.call(rss_url)

          # get last_modified or if new
          # NOTE(review): assumes fetch returns the first row as [rss, last] - confirm against rdbi
          db_rss, last = dbi.execute("SELECT rss, last FROM sources WHERE collection=? AND rss=?",
            collection, rss_url).fetch
          new_source = db_rss.nil? || db_rss.empty?
          uri = URI.parse rss_url

          # prepare request
          header = {}
          header['Authorization'] = [uri.user, uri.password] if uri.user

          if new_source || last.nil?
            logger.info rss_url_nice + "GET"
          else
            logger.info rss_url_nice + "GET with If-Modified-Since: #{ last }"
            header['If-Modified-Since'] = last
          end

          # do request
          job = [collection, rss_url]
          pending << job
          http = EM::HttpRequest.new(uri).get :head => header

          http.errback do
            logger.error rss_url_nice + "Request Error: #{ http.error }"
            finish.call(job)
          end

          http.callback do
            if http.response_header.status != 200
              logger.warn rss_url_nice + "HTTP not OK, but: #{ http.response_header.status }"
            elsif http.response.size > settings['size limit'].to_i
              logger.warn rss_url_nice + "Got too big response: #{ http.response.size } bytes"
            else
              yield rss_url, new_source, collection, http.response, rss_url_nice
            end

            finish.call(job)
          end
        }
      }

      # give up on everything that is still outstanding after the timeout
      EM.add_timer(settings['timeout'].to_i){
        pending.each { |_, rss_url| logger.warn log_prefix.call(rss_url) + 'Timed out' }
        EM.stop
      }

      # nothing was requested at all: do not idle until the timeout
      EM.stop if pending.empty?
    end
    logger.info '[done ] fetch using Eventmachine'
  end
end