harvester 0.8.0.pre.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/CHANGELOG.rdoc +45 -0
  2. data/README.rdoc +74 -0
  3. data/Rakefile +28 -0
  4. data/bin/harvester +13 -0
  5. data/bin/harvester-chart +5 -0
  6. data/bin/harvester-clock +35 -0
  7. data/bin/harvester-db +15 -0
  8. data/bin/harvester-fetch +5 -0
  9. data/bin/harvester-generate +5 -0
  10. data/bin/harvester-jabber +6 -0
  11. data/bin/harvester-new +25 -0
  12. data/bin/harvester-post +5 -0
  13. data/bin/harvester-run +14 -0
  14. data/collections.yaml +15 -0
  15. data/config.yaml +13 -0
  16. data/data/ent/HTMLlat1.ent +194 -0
  17. data/data/ent/HTMLspecial.ent +77 -0
  18. data/data/ent/HTMLsymbol.ent +241 -0
  19. data/data/sql/dbd-mysql-isotime.diff +11 -0
  20. data/data/sql/harvester-0.6-mysql.diff +59 -0
  21. data/data/sql/harvester-0.7-mysql.diff +39 -0
  22. data/data/sql/mysql/chart.sql +1 -0
  23. data/data/sql/mysql/create.table.enclosures.sql +9 -0
  24. data/data/sql/mysql/create.table.items.sql +8 -0
  25. data/data/sql/mysql/create.table.jabbersettings.sql +5 -0
  26. data/data/sql/mysql/create.table.jabbersubscriptions.sql +5 -0
  27. data/data/sql/mysql/create.table.sources.sql +9 -0
  28. data/data/sql/mysql/create.view.last48hours.sql +1 -0
  29. data/data/sql/postgresql/chart.sql +1 -0
  30. data/data/sql/postgresql/create.table.enclosures.sql +9 -0
  31. data/data/sql/postgresql/create.table.items.sql +8 -0
  32. data/data/sql/postgresql/create.table.jabbersettings.sql +5 -0
  33. data/data/sql/postgresql/create.table.jabbersubscriptions.sql +5 -0
  34. data/data/sql/postgresql/create.table.sources.sql +9 -0
  35. data/data/sql/postgresql/create.view.last48hours.sql +1 -0
  36. data/data/sql/sqlite3/chart.sql +1 -0
  37. data/data/sql/sqlite3/create.table.enclosures.sql +9 -0
  38. data/data/sql/sqlite3/create.table.items.sql +8 -0
  39. data/data/sql/sqlite3/create.table.jabbersettings.sql +5 -0
  40. data/data/sql/sqlite3/create.table.jabbersubscriptions.sql +5 -0
  41. data/data/sql/sqlite3/create.table.sources.sql +9 -0
  42. data/data/sql/sqlite3/create.view.last48hours.sql +1 -0
  43. data/data/templates/atom-all.xml +88 -0
  44. data/data/templates/atom.xml +88 -0
  45. data/data/templates/index.html +412 -0
  46. data/data/templates/rss-all.rdf +86 -0
  47. data/data/templates/rss.rdf +85 -0
  48. data/data/templates/static/harvester.css +365 -0
  49. data/data/templates/static/harvester.gif +0 -0
  50. data/data/templates/static/harvester_ie7.css +15 -0
  51. data/data/templates/static/harvester_lte_ie6.css +27 -0
  52. data/harvester.gemspec +35 -0
  53. data/lib/harvester.rb +132 -0
  54. data/lib/harvester/chart.rb +72 -0
  55. data/lib/harvester/db.rb +123 -0
  56. data/lib/harvester/fetch.rb +96 -0
  57. data/lib/harvester/generate.rb +152 -0
  58. data/lib/harvester/generator/entity_translator.rb +46 -0
  59. data/lib/harvester/generator/link_absolutizer.rb +39 -0
  60. data/lib/harvester/jabber.rb +443 -0
  61. data/lib/harvester/mrss.rb +355 -0
  62. data/lib/harvester/post.rb +19 -0
  63. metadata +237 -0
Binary file
@@ -0,0 +1,15 @@
1
/*
 * harvester_ie7.css
 *
 * Description: Adjustments to the default stylesheet 'harvester.css'
 *              for IE7
 *
 */

/* ------------------------------------------------------ */
/* Blog entries                                           */
/* ------------------------------------------------------ */

.entry-content pre,
.entry-content code {
  overflow: scroll;
}
@@ -0,0 +1,27 @@
1
/*
 * harvester_lte_ie6.css
 *
 * Description: Adjustments to the default stylesheet 'harvester.css'
 *              for IE6 and older versions
 *
 */

/* ------------------------------------------------------ */
/* Blog entries                                           */
/* ------------------------------------------------------ */

.entry {
  width: 100%;
  /*width: 98%;*/ /* dirty hack */
}

.entry-content pre,
.entry-content code {
  width: 90%; /* dirty hack */
  overflow: scroll;
}

.tickers3 .ticker ul li {
  padding-bottom: 1em;
  width: 100%;
}
data/harvester.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # encoding: utf-8
2
+ require 'rubygems' unless defined? Gem
3
+ require File.dirname(__FILE__) + "/lib/harvester"
4
+
5
# Gem specification for the harvester feed aggregator.
Gem::Specification.new do |spec|
  # identity
  spec.name        = "harvester"
  spec.version     = Harvester::VERSION
  spec.authors     = ["astro", "Neingeist", "Tigion", "Josef Spillner", "Jan Lelis"]
  spec.email       = "mail@janlelis.de"
  spec.homepage    = "https://github.com/janlelis/harvester"
  spec.summary     = "Web-based feed aggregator"
  spec.description = "The harvester collects your favourite feeds and generates static html/feed pages"
  spec.license     = "AGPL"

  # platform requirements
  spec.required_ruby_version     = ">= 1.9.2"
  spec.required_rubygems_version = ">= 1.3.6"

  # runtime dependencies, annotated with the harvester command that needs them
  runtime_dependencies = [
    'rdbi', 'rdbi-driver-sqlite3', 'logger-colors', # main
    'eventmachine', 'em-http-request',              # fetch
    'ruby-xslt', 'hpricot',                         # generate
    'rmagick', 'gruff',                             # chart
    'clockwork',                                    # clock
    # 'xmpp4r',                                     # jabber
  ]
  runtime_dependencies.each { |gem_name| spec.add_dependency gem_name }

  # packaged files and executables
  packaged = Dir.glob(%w|lib/**/*.rb bin/* [A-Z]*.{txt,rdoc} data/**/* *.yaml|)
  spec.files       = packaged + %w|Rakefile harvester.gemspec|
  spec.executables = Dir['bin/*'].map { |path| File.basename(path) }
end
data/lib/harvester.rb ADDED
@@ -0,0 +1,132 @@
1
+ require 'rubygems' unless defined? Gem
2
+ require 'rdbi'
3
+ require 'yaml'
4
+ require 'logger/colors'
5
+
6
# Core object of the harvester: loads the YAML configuration and the feed
# collections, sets up the logger and opens the database connection.
# Other files reopen this class to add the individual tasks.
class Harvester
  VERSION = '0.8.0.pre.1'

  attr_reader :config, :settings, :collections, :dbi, :logger

  # Creates a new Harvester instance.
  #
  # options - Hash with String keys. 'config' is the path of the YAML config
  #           file (default './config.yaml'); every other entry overrides the
  #           setting of the same name. The passed hash is not modified.
  #
  # Raises LoadError when the config file or the collections file is missing,
  # and re-raises whatever goes wrong while connecting to the database.
  def initialize(options = {})
    options     = options.dup # work on a copy, the caller keeps its hash
    config_path = File.expand_path(options.delete('config') || './config.yaml')

    @config      = load_config(config_path)
    @settings    = build_settings(options)
    @collections = load_collections(@settings['collections'])
    @logger      = build_logger
    @dbi         = connect_db
  end

  # creates a new harvester using the command-line options to configure it
  def self.new_from_argv
    options = {}

    require 'optparse'
    OptionParser.new do |op|
      # the banner lines stay flush-left: every character inside %q{} is part of the string
      op.banner = %q{USAGE:
harvester <COMMAND> [OPTIONS]
COMMANDS:
run run a complete harvester update
fetch run only the fetch script
generate run only the generate script
chart run only the generate chart script
post run only the post processing script
db start a database task (create or maintenance)
clock start the scheduler (cron replacement)
new create a new harvester project
jabber start the jabber bot (not implemented yet)
OPTIONS:} # automatically added as --help
      op.on('-v', '--version') do
        puts Harvester::VERSION
        exit
      end
      op.on('-c', '--config FILE') do |config|
        options['config'] = config
      end
      op.on('-l', '--log_file FILE') do |log_file|
        options['log_file'] = log_file
      end
      op.on('-L', '--log_level NUMBER') do |log_level|
        options['log_level'] = log_level
      end
      op.on('-p', '--post_script FILE') do |post_script|
        options['post_script'] = post_script
      end
      op.on('-m', '--no-maintenance') do
        options['no-maintenance'] = true
      end
      op.on('-s', '--no-chart') do
        options['no-chart'] = true
      end
    end.parse!

    Harvester.new options
  end

  protected

  # logger helpers
  def debug(msg) @logger.debug(msg) end
  def info(msg)  @logger.info(msg)  end
  def warn(msg)  @logger.warn(msg)  end
  def error(msg) @logger.error(msg) end
  def fatal(msg) @logger.fatal(msg) end

  # adds an info message before and after the block
  def task(msg) # MAYBE: nested spaces+behaviour
    info "[start] " + msg
    yield
    info "[done ] " + msg
  end

  private

  # Reads the YAML config file; an empty file is treated as an empty config.
  def load_config(path)
    YAML.load_file(path) || {}
  rescue Errno::ENOENT
    raise LoadError, "Could not find a yaml config file at #{ path }"
  end

  # Built-in defaults, overridden by the config file's 'settings' section
  # (which may be absent) and then by the remaining options.
  def build_settings(options)
    {
      'collections' => 'collections.yaml',
      'timeout'     => 90,
      'size limit'  => 200_000,
      'log_level'   => Logger::DEBUG, # 0
      'log_file'    => STDOUT,
    }.merge(@config['settings'] || {}).merge(options)
  end

  # Reads the YAML collections file named by the 'collections' setting.
  def load_collections(path)
    YAML.load_file(path)
  rescue Errno::ENOENT
    raise LoadError, "Could not find a yaml collections file at #{ File.expand_path(path) }"
  end

  # Builds the logger from the 'log_file' and 'log_level' settings.
  def build_logger
    logger = Logger.new(resolve_log_device(@settings['log_file']))
    logger.formatter = proc { |_level, _datetime, _appname, msg| "#{msg}\n" }
    logger.level     = resolve_log_level(@settings['log_level'])
    logger
  end

  # Maps the 'log_file' setting to a log device: nil falls back to STDOUT,
  # the names "STDOUT"/"STDERR" become the matching stream, and anything
  # else (a path or an IO object) is handed to Logger unchanged.
  def resolve_log_device(target)
    case target
    when nil                  then STDOUT
    when /\ASTD(?:OUT|ERR)\z/ then Object.const_get(target)
    else target
    end
  end

  # Accepts a severity name (debug, info, warn, error, fatal; any case) or a
  # numeric level.
  def resolve_log_level(value)
    name = value.to_s.downcase
    if %w[debug info warn error fatal].include?(name)
      Logger::Severity.const_get(name.upcase)
    else
      value.to_i
    end
  end

  # Loads the rdbi driver named in the config and opens the connection.
  # Failures are logged and re-raised; LoadError is listed explicitly because
  # a missing driver gem is not a StandardError.
  def connect_db
    db = @config['db']
    require 'rdbi/driver/' + db['driver'].downcase # FIXME?

    RDBI.connect db['driver'],
                 database: db['database'],
                 user:     db['user'],
                 password: db['password']
  rescue StandardError, LoadError
    error 'Something is wrong with your database settings:'
    raise
  end
end
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative '../harvester'
4
+ require_relative '../harvester/db'
5
+ require 'gruff'
6
+
7
class Harvester
  module CHART; end

  # Builds the fetch statistics image: runs the driver-specific chart query,
  # counts the returned rows per collection and day of month, and writes the
  # chart as chart.jpg into the configured output directory.
  def chart!
    info "CHART"
    task "generate chart" do
      stats = Chart::StatsPerCollection.new
      rows  = @dbi.execute(File.read(sql_query(:chart)))
      rows.each do |date, collection|
        stats.add_one(collection, Date.parse(date).day)
      end

      target = File.join(@config['settings']['output'], '/chart.jpg')
      Chart.new(stats).write(target)
    end
  end
end
22
+
23
# Line chart of the harvested items per day, rendered with gruff.
class Harvester::Chart
  # stats   - object that yields (name, values) pairs from #each and exposes
  #           the charted days via #days
  # options - currently unused
  def initialize(stats, options = {}) # TODO configure g with options
    @g = Gruff::Line.new(300)
    @g.title        = "Harvested items per day"
    @g.x_axis_label = "Days"
    @g.y_axis_label = "Items"

    # one data series per collection
    stats.each { |name, values| @g.data(name, values) }

    # x-axis labels: position on the axis => day
    @g.labels = stats.days.each_with_index.each_with_object({}) do |(day, position), labels|
      labels[position] = day.to_s
    end
  end

  # Renders the chart to the image file at +path+.
  def write(path)
    @g.write(path)
  end
end
44
+
45
# Counts items per collection and day, in the shape the chart needs:
# one series of counts per collection, aligned with the list of seen days.
class Harvester::Chart::StatsPerCollection
  # days in the order they were first seen
  attr_reader :days

  def initialize
    @collections = {} # collection name => { day => count }
    @days = []
  end

  # Registers one item for +collection+ on +day+. A nil collection is counted
  # under '(unknown)'. Returns the day => count hash of that collection.
  def add_one(collection, day)
    @days << day unless @days.include?(day)
    collection ||= '(unknown)' # TODO research

    per_day = (@collections[collection] ||= {})
    per_day[day] = per_day.fetch(day, 0) + 1
    per_day
  end

  # Yields every collection name together with its counts, one value per
  # entry of #days (0 where the collection has no items on that day).
  # Returns an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    @collections.each do |name, per_day|
      yield name, @days.map { |day| per_day[day].to_i }
    end
  end
end
@@ -0,0 +1,123 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative '../harvester'
4
+
5
class Harvester
  module DB; end

  # Creates the required database structure by executing every create*.sql
  # file of the configured driver inside one transaction.
  def create!
    task "create database tables" do
      @dbi.transaction do
        sql_queries(:create).each do |sql|
          info "* execute " + File.basename(sql)
          @dbi.execute File.read(sql)
        end
      end
    end
  end

  # Checks for feed source changes: removes sources that are no longer listed
  # in their collection and purges the items of feeds that no collection
  # references any more.
  def maintenance!
    task "look for sources to purge" do
      stale = []
      @dbi.execute("SELECT collection, rss FROM sources").each do |dbc, dbr|
        stale << [dbc, dbr] unless (@collections[dbc] || []).include?(dbr)
      end

      stale.each do |c, r|
        info "* remove #{c}:#{r}..."
        @dbi.execute "DELETE FROM sources WHERE collection=? AND rss=?", c, r
      end

      # a feed removed from one collection may still be listed in another
      purge_rss = stale.map { |_, r| r }.reject do |r|
        holder = @collections.find { |_, urls| urls && urls.include?(r) }
        warn "* must keep #{r} because it's still in #{holder.first}" if holder
        holder
      end

      purge_rss.each do |r|
        info "* purge items from feed #{r}"
        @dbi.execute "DELETE FROM items WHERE rss=?", r
      end
    end
  end

  private

  # Parses a fetched feed and stores its source row, items and enclosures in
  # one transaction.
  #
  # rss_url      - URL the feed was fetched from
  # new_source   - true when the source is not in the database yet
  # collection   - name of the collection the feed belongs to
  # response     - the fetched feed, handed to MRSS.parse
  # rss_url_nice - prefix for log messages
  def update(rss_url, new_source, collection, response, rss_url_nice = rss_url)
    rss = MRSS.parse(response)
    feed_link = rss.link.to_s

    @dbi.transaction do
      # NOTE(review): if response is the plain body String, String#[] is a
      # substring lookup and this is not the HTTP header - confirm what the
      # fetcher passes in.
      last_modified = response['Last-Modified']

      # update source
      if new_source
        @dbi.execute "INSERT INTO sources (collection, rss, last, title, link, description) VALUES (?, ?, ?, ?, ?, ?)",
                     collection, rss_url, last_modified, rss.title, rss.link, rss.description
        info rss_url_nice + "Added as source"
      else
        @dbi.execute "UPDATE sources SET last=?, title=?, link=?, description=? WHERE collection=? AND rss=?",
                     last_modified, rss.title, rss.link, rss.description, collection, rss_url
        debug rss_url_nice + "Source updated"
      end

      # update items
      items_new, items_updated = 0, 0
      rss.items.each do |item|
        description = item.description

        # Link mangling: resolve against the feed's link, or against the
        # feed URL itself when the feed does not declare one
        link = begin
          URI.join(feed_link.empty? ? rss_url.to_s : feed_link, item.link || rss.link).to_s
        rescue URI::Error
          item.link
        end

        # Push into database
        db_title, = *@dbi.execute("SELECT title FROM items WHERE rss=? AND link=?", rss_url, link).fetch

        if db_title.nil? || db_title.empty? # item is new
          @dbi.execute "INSERT INTO items (rss, title, link, date, description) VALUES (?, ?, ?, ?, ?)",
                       rss_url, item.title, link, item.date.to_s, description
          items_new += 1
        else
          @dbi.execute "UPDATE items SET title=?, description=? WHERE rss=? AND link=?",
                       item.title, description, rss_url, link
          items_updated += 1
        end

        # Remove all enclosures
        @dbi.execute "DELETE FROM enclosures WHERE rss=? AND link=?", rss_url, link

        # Re-add all enclosures
        item.enclosures.each do |enclosure|
          # an unparsable href is stored as-is instead of aborting the whole feed
          href = begin
            URI.join(feed_link.empty? ? link.to_s : feed_link, enclosure['href']).to_s
          rescue URI::Error
            enclosure['href']
          end
          length = enclosure['length']

          @dbi.execute "INSERT INTO enclosures (rss, link, href, mime, title, length) VALUES (?, ?, ?, ?, ?, ?)",
                       rss_url, link, href, enclosure['type'], enclosure['title'],
                       (!length || length.empty?) ? 0 : length
        end
      end
      info rss_url_nice + "#{ items_new } new items, #{ items_updated } updated"
    end
  end

  # Enumerator over the paths of all sql files of the configured driver whose
  # name starts with +task+, in alphabetical order (Dir[] itself does not
  # guarantee an order).
  def sql_queries(task)
    Dir[ File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }*.sql" ].sort.each
  end

  # Path of the single sql file for +task+ of the configured driver.
  def sql_query(task)
    File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }.sql"
  end
end
@@ -0,0 +1,96 @@
1
+ # encoding: utf-8
2
+
3
+ require_relative '../harvester'
4
+ require_relative 'db'
5
+ require_relative 'mrss'
6
+
7
+ require 'eventmachine'
8
+ require 'em-http'
9
+ require 'uri'
10
+
11
class Harvester
  module FETCH; end

  # Fetches new feed updates and stores them in the database. Runs the
  # source maintenance first unless the 'no-maintenance' setting is given.
  def fetch!
    info "FETCH"
    maintenance! unless @settings['no-maintenance']

    # every fetched feed is handed on to #update
    Fetcher.run(@dbi, @collections, @settings, @logger) { |*result| update(*result) }
  end
end
21
+
22
# fetches new feed updates and store them in the database using Eventmachine
module Harvester::Fetcher
  # Requests every feed url of every collection and yields
  # (rss_url, new_source, collection, response_body, log_prefix) for each
  # response with status 200 that stays within the size limit.
  #
  # dbi         - database handle, used to look up known sources
  # collections - collection name => list of feed urls (or nil)
  # settings    - reads 'size limit' (bytes) and 'timeout' (seconds)
  # logger      - receives progress, warning and error messages
  #
  # Returns once all requests have finished or the timeout has expired.
  def self.run(dbi, collections, settings, logger)
    logger.info '[start] fetch using Eventmachine'

    # prepare logger: pad every url to the longest one so the log lines align
    max_url_size = collections.inject(0) { |acc, (_, rss_urls)|
      [acc, *(rss_urls || []).map(&:size)].max
    }
    log_prefix = lambda { |url| '* ' + url.ljust(max_url_size) + ' | ' }

    #dbi['AutoCommit'] = false # TODO check for rdbi

    EventMachine.run do
      pending = []

      # Removes ONE pending entry for url (the same url may be listed in
      # several collections) and stops the reactor after the last request.
      finish = lambda { |url|
        position = pending.index(url)
        pending.delete_at(position) if position
        EM.stop if pending.empty?
      }

      collections.each do |collection, rss_urls|
        (rss_urls || []).each do |rss_url|
          rss_url_nice = log_prefix.call(rss_url)

          # get last_modified or if new
          # NOTE(review): this destructuring assumes fetch returns a single
          # [rss, last] row - confirm against the rdbi Result#fetch API.
          db_rss, last = dbi.execute("SELECT rss, last FROM sources WHERE collection=? AND rss=?",
                                     collection, rss_url).fetch
          new_source = db_rss.nil? || db_rss.empty?
          uri = URI.parse rss_url

          # prepare request
          header = {}
          header['Authorization'] = [uri.user, uri.password] if uri.user

          if new_source || last.nil?
            logger.info rss_url_nice + "GET"
          else
            logger.info rss_url_nice + "GET with If-Modified-Since: #{ last }"
            header['If-Modified-Since'] = last
          end

          # do request
          pending << rss_url
          http = EM::HttpRequest.new(uri).get :head => header

          http.errback do
            logger.error rss_url_nice + "Request Error: #{ http.error }"
            finish.call(rss_url)
          end

          http.callback do
            status = http.response_header.status
            body   = http.response

            if status != 200
              logger.warn rss_url_nice + "HTTP not OK, but: #{ status }"
            elsif body.size > settings['size limit'].to_i
              logger.warn rss_url_nice + "Got too big response: #{ body.size } bytes"
            else
              yield rss_url, new_source, collection, body, rss_url_nice
            end

            finish.call(rss_url)
          end
        end
      end

      if pending.empty?
        # nothing to fetch: do not keep the reactor alive until the timeout
        EM.stop
      else
        EM.add_timer(settings['timeout'].to_i) {
          pending.each { |rss_url| logger.warn log_prefix.call(rss_url) + 'Timed out' }
          EM.stop
        }
      end
    end
    logger.info '[done ] fetch using Eventmachine'
  end
end
+ end