harvester 0.8.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +45 -0
- data/README.rdoc +74 -0
- data/Rakefile +28 -0
- data/bin/harvester +13 -0
- data/bin/harvester-chart +5 -0
- data/bin/harvester-clock +35 -0
- data/bin/harvester-db +15 -0
- data/bin/harvester-fetch +5 -0
- data/bin/harvester-generate +5 -0
- data/bin/harvester-jabber +6 -0
- data/bin/harvester-new +25 -0
- data/bin/harvester-post +5 -0
- data/bin/harvester-run +14 -0
- data/collections.yaml +15 -0
- data/config.yaml +13 -0
- data/data/ent/HTMLlat1.ent +194 -0
- data/data/ent/HTMLspecial.ent +77 -0
- data/data/ent/HTMLsymbol.ent +241 -0
- data/data/sql/dbd-mysql-isotime.diff +11 -0
- data/data/sql/harvester-0.6-mysql.diff +59 -0
- data/data/sql/harvester-0.7-mysql.diff +39 -0
- data/data/sql/mysql/chart.sql +1 -0
- data/data/sql/mysql/create.table.enclosures.sql +9 -0
- data/data/sql/mysql/create.table.items.sql +8 -0
- data/data/sql/mysql/create.table.jabbersettings.sql +5 -0
- data/data/sql/mysql/create.table.jabbersubscriptions.sql +5 -0
- data/data/sql/mysql/create.table.sources.sql +9 -0
- data/data/sql/mysql/create.view.last48hours.sql +1 -0
- data/data/sql/postgresql/chart.sql +1 -0
- data/data/sql/postgresql/create.table.enclosures.sql +9 -0
- data/data/sql/postgresql/create.table.items.sql +8 -0
- data/data/sql/postgresql/create.table.jabbersettings.sql +5 -0
- data/data/sql/postgresql/create.table.jabbersubscriptions.sql +5 -0
- data/data/sql/postgresql/create.table.sources.sql +9 -0
- data/data/sql/postgresql/create.view.last48hours.sql +1 -0
- data/data/sql/sqlite3/chart.sql +1 -0
- data/data/sql/sqlite3/create.table.enclosures.sql +9 -0
- data/data/sql/sqlite3/create.table.items.sql +8 -0
- data/data/sql/sqlite3/create.table.jabbersettings.sql +5 -0
- data/data/sql/sqlite3/create.table.jabbersubscriptions.sql +5 -0
- data/data/sql/sqlite3/create.table.sources.sql +9 -0
- data/data/sql/sqlite3/create.view.last48hours.sql +1 -0
- data/data/templates/atom-all.xml +88 -0
- data/data/templates/atom.xml +88 -0
- data/data/templates/index.html +412 -0
- data/data/templates/rss-all.rdf +86 -0
- data/data/templates/rss.rdf +85 -0
- data/data/templates/static/harvester.css +365 -0
- data/data/templates/static/harvester.gif +0 -0
- data/data/templates/static/harvester_ie7.css +15 -0
- data/data/templates/static/harvester_lte_ie6.css +27 -0
- data/harvester.gemspec +35 -0
- data/lib/harvester.rb +132 -0
- data/lib/harvester/chart.rb +72 -0
- data/lib/harvester/db.rb +123 -0
- data/lib/harvester/fetch.rb +96 -0
- data/lib/harvester/generate.rb +152 -0
- data/lib/harvester/generator/entity_translator.rb +46 -0
- data/lib/harvester/generator/link_absolutizer.rb +39 -0
- data/lib/harvester/jabber.rb +443 -0
- data/lib/harvester/mrss.rb +355 -0
- data/lib/harvester/post.rb +19 -0
- metadata +237 -0
data/data/templates/static/harvester.gif ADDED
Binary file
data/data/templates/static/harvester_ie7.css ADDED
@@ -0,0 +1,15 @@
+/*
+ * harvester_ie7.css
+ *
+ * Description: Adjustments to the default stylesheet 'harvester.css'
+               for IE7
+ *
+ */
+
+/* ------------------------------------------------------ */
+/* Blog entries */
+/* ------------------------------------------------------ */
+
+.entry-content pre, .entry-content code {
+  overflow: scroll;
+}
data/data/templates/static/harvester_lte_ie6.css ADDED
@@ -0,0 +1,27 @@
+/*
+ * harvester_lte_ie6.css
+ *
+ * Description: Adjustments to the default stylesheet 'harvester.css'
+               for IE6 and older versions
+ *
+ */
+
+/* ------------------------------------------------------ */
+/* Blog entries */
+/* ------------------------------------------------------ */
+
+.entry {
+  width: 100%;
+  /*width: 98%;*/ /* dirty hack */
+}
+
+.entry-content pre,
+.entry-content code {
+  width: 90%; /* dirty hack */
+  overflow: scroll;
+}
+
+.tickers3 .ticker ul li {
+  padding-bottom: 1em;
+  width: 100%;
+}
data/harvester.gemspec
ADDED
@@ -0,0 +1,35 @@
+# encoding: utf-8
+require 'rubygems' unless defined? Gem
+require File.dirname(__FILE__) + "/lib/harvester"
+
+Gem::Specification.new do |s|
+  s.name = "harvester"
+  s.version = Harvester::VERSION
+  s.authors = ["astro", "Neingeist", "Tigion", "Josef Spillner", "Jan Lelis"]
+  s.email = "mail@janlelis.de"
+  s.homepage = "https://github.com/janlelis/harvester"
+  s.summary = "Web-based feed aggregator"
+  s.description = "The harvester collects your favourite feeds and generates static html/feed pages"
+  s.required_ruby_version = ">= 1.9.2"
+  s.required_rubygems_version = ">= 1.3.6"
+  # main
+  s.add_dependency 'rdbi'
+  s.add_dependency 'rdbi-driver-sqlite3'
+  s.add_dependency 'logger-colors'
+  # fetch
+  s.add_dependency 'eventmachine'
+  s.add_dependency 'em-http-request'
+  # generate
+  s.add_dependency 'ruby-xslt'
+  s.add_dependency 'hpricot'
+  # chart
+  s.add_dependency 'rmagick'
+  s.add_dependency 'gruff'
+  # clock
+  s.add_dependency 'clockwork'
+  # jabber
+  # s.add_dependency 'xmpp4r'
+  s.files = Dir.glob(%w|lib/**/*.rb bin/* [A-Z]*.{txt,rdoc} data/**/* *.yaml|) + %w|Rakefile harvester.gemspec|
+  s.executables = Dir['bin/*'].map{|f| File.basename f }
+  s.license = "AGPL"
+end
data/lib/harvester.rb
ADDED
@@ -0,0 +1,132 @@
+require 'rubygems' unless defined? Gem
+require 'rdbi'
+require 'yaml'
+require 'logger/colors'
+
+class Harvester
+  VERSION = '0.8.0.pre.1'
+
+  attr_reader :config, :settings, :collections, :dbi, :logger
+
+  # takes an options hash (which overwrites settings) and creates a new Harvester instance
+  def initialize(options = {})
+    # command line options
+    options['config'] ||= './config.yaml'
+
+    # load config
+    begin
+      config_path = File.expand_path( options.delete('config') )
+      @config = YAML::load_file config_path
+    rescue Errno::ENOENT
+      raise LoadError, "Could not find a yaml config file at #{ config_path }"
+    end
+
+    # instance variable helpers
+    @settings = {
+      'collections' => 'collections.yaml',
+      'timeout'     => 90,
+      'size limit'  => 200_000,
+      'log_level'   => Logger::DEBUG, # 0
+      'log_file'    => STDOUT,
+    }
+    @settings.merge! @config['settings']
+    @settings.merge! options
+
+    # load collections
+    begin
+      @collections = YAML::load_file @settings['collections']
+    rescue Errno::ENOENT
+      raise LoadError, "Could not find a yaml collections file at #{ File.expand_path( @settings['collections'] ) }"
+    end
+
+    # init logger
+    @logger = Logger.new(
+      if !@settings['log_file'] || @settings['log_file'] =~ /^STD(?:OUT|ERR)$/
+        Object.const_get(@settings['log_file'])
+      else
+        @settings['log_file']
+      end
+    )
+    @logger.formatter = proc { |level, datetime, appname, msg| "#{msg}\n" }
+    @logger.level = if %w[debug info warn error fatal].include?(@settings['log_level'].to_s.downcase)
+      Logger::Severity.const_get(@settings['log_level'].to_s.upcase)
+    else
+      @settings['log_level'].to_i
+    end
+
+    # connect to db
+    begin
+      require 'rdbi/driver/' + config['db']['driver'].downcase # FIXME?
+
+      @dbi = RDBI::connect config['db']['driver'],
+                           database: config['db']['database'],
+                           user:     config['db']['user'],
+                           password: config['db']['password']
+    rescue Exception
+      error 'Something is wrong with your database settings:'
+      raise
+    end
+  end
+
+  # creates a new harvester using the command-line options to configure it
+  def self.new_from_argv
+    options = {}
+
+    require 'optparse'
+    OptionParser.new do |op|
+      op.banner = %q{USAGE:
+  harvester <COMMAND> [OPTIONS]
+COMMANDS:
+  run       run a complete harvester update
+  fetch     run only the fetch script
+  generate  run only the generate script
+  chart     run only the generate chart script
+  post      run only the post processing script
+  db        start a database task (create or maintenance)
+  clock     start the scheduler (cron replacement)
+  new       create a new harvester project
+  jabber    start the jabber bot (not implemented yet)
+OPTIONS:} # automatically added as --help
+      op.on('-v', '--version') do
+        puts Harvester::VERSION
+        exit
+      end
+      op.on('-c', '--config FILE') do |config|
+        options['config'] = config
+      end
+      op.on('-l', '--log_file FILE') do |log_file|
+        options['log_file'] = log_file
+      end
+      op.on('-L', '--log_level NUMBER') do |log_level|
+        options['log_level'] = log_level
+      end
+      op.on('-p', '--post_script FILE') do |post_script|
+        options['post_script'] = post_script
+      end
+      op.on('-m', '--no-maintenance') do
+        options['no-maintenance'] = true
+      end
+      op.on('-s', '--no-chart') do
+        options['no-chart'] = true
+      end
+    end.parse!
+
+    Harvester.new options
+  end
+
+  protected
+
+  # logger helpers
+  def debug(msg) @logger.debug(msg) end
+  def info(msg)  @logger.info(msg)  end
+  def warn(msg)  @logger.warn(msg)  end
+  def error(msg) @logger.error(msg) end
+  def fatal(msg) @logger.fatal(msg) end
+
+  # adds an info message before and after the block
+  def task(msg) # MAYBE: nested spaces+behaviour
+    info "[start] " + msg
+    yield
+    info "[done ] " + msg
+  end
+end
data/lib/harvester/chart.rb ADDED
@@ -0,0 +1,72 @@
+# encoding: utf-8
+
+require_relative '../harvester'
+require_relative '../harvester/db'
+require 'gruff'
+
+class Harvester
+  module CHART; end
+
+  # generates a fetch statistics image
+  def chart!
+    info "CHART"
+    task "generate chart" do
+      c = Chart::StatsPerCollection.new
+      @dbi.execute( File.read( sql_query(:chart) ) ).each{ |date,collection|
+        c.add_one(collection, Date.parse(date).day)
+      }
+      Chart.new(c).write File.join( @config['settings']['output'], '/chart.jpg' )
+    end
+  end
+end
+
+# generates a fetch statistics image using gruff
+class Harvester::Chart
+  def initialize(stats, options = {}) # TODO configure g with options
+    @g = Gruff::Line.new(300)
+    @g.title = "Harvested items per day"
+    @g.x_axis_label = "Days"
+    @g.y_axis_label = "Items"
+
+    stats.each(&@g.method(:data))
+
+    labels = {}
+    stats.days.each_with_index do |d,i|
+      labels[i] = d.to_s
+    end
+    @g.labels = labels
+  end
+
+  def write(path)
+    @g.write(path)
+  end
+end
+
+class Harvester::Chart::StatsPerCollection
+  attr_reader :days
+
+  def initialize
+    @collections = {}
+    @days = []
+  end
+
+  def add_one(collection, day)
+    @days << day unless @days.index(day)
+    collection ||= '(unknown)' # TODO research
+
+    c = @collections[collection] || {}
+    c[day] = (c[day] || 0) + 1
+    @collections[collection] = c
+  end
+
+  def each
+    @collections.each { |n,c|
+      v = []
+      @days.each { |d|
+        v << c[d].to_i
+      }
+
+      yield n, v
+    }
+  end
+end
data/lib/harvester/db.rb
ADDED
@@ -0,0 +1,123 @@
+# encoding: utf-8
+
+require_relative '../harvester'
+
+class Harvester
+  module DB; end
+
+  # creates required database structure
+  def create!
+    task "create database tables" do
+      begin @dbi.transaction do
+        sql_queries(:create).each{ |sql|
+          info "* execute " + File.basename(sql)
+          @dbi.execute File.read(sql)
+        }
+      end; end
+    end
+  end
+
+  # check for feed source changes
+  def maintenance!
+    task "look for sources to purge" do
+      purge = []
+      @dbi.execute("SELECT collection, rss FROM sources").each{ |dbc,dbr|
+        purge << [dbc, dbr] unless (@collections[dbc] || []).include? dbr
+      }
+
+      purge_rss = []
+      purge.each { |c,r|
+        info "* remove #{c}:#{r}..."
+        @dbi.execute "DELETE FROM sources WHERE collection=? AND rss=?", c, r
+        purge_rss << r
+      }
+
+      purge_rss.delete_if { |r|
+        purge_this = true
+
+        @collections.each { |cfc,cfr|
+          if purge_this
+            warn "* must keep #{r} because it's still in #{cfc}" if cfr && cfr.include?(r)
+            purge_this = !(cfr && cfr.include?(r))
+          end
+        }
+
+        !purge_this
+      }
+      purge_rss.each { |r|
+        info "* purge items from feed #{r}"
+        @dbi.execute "DELETE FROM items WHERE rss=?", r
+      }
+    end
+  end
+
+  private
+
+  def update(rss_url, new_source, collection, response, rss_url_nice = rss_url)
+    rss = MRSS.parse(response)
+
+    begin @dbi.transaction do
+      # update source
+      if new_source
+        @dbi.execute "INSERT INTO sources (collection, rss, last, title, link, description) VALUES (?, ?, ?, ?, ?, ?)",
+          collection, rss_url, response['Last-Modified'], rss.title, rss.link, rss.description
+        info rss_url_nice + "Added as source"
+      else
+        @dbi.execute "UPDATE sources SET last=?, title=?, link=?, description=? WHERE collection=? AND rss=?",
+          response['Last-Modified'], rss.title, rss.link, rss.description, collection, rss_url
+        debug rss_url_nice + "Source updated"
+      end
+
+      # update items
+      items_new, items_updated = 0, 0
+      rss.items.each { |item|
+        description = item.description
+
+        # Link mangling
+        begin
+          link = URI::join((rss.link.to_s == '') ? rss_url : rss.link.to_s, item.link || rss.link).to_s
+        rescue URI::Error
+          link = item.link
+        end
+
+        # Push into database
+        db_title, = *@dbi.execute("SELECT title FROM items WHERE rss=? AND link=?", rss_url, link).fetch
+
+        if db_title.nil? || db_title.empty? # item is new
+          begin
+            @dbi.execute "INSERT INTO items (rss, title, link, date, description) VALUES (?, ?, ?, ?, ?)",
+              rss_url, item.title, link, item.date.to_s, description
+            items_new += 1
+          #rescue DBI::ProgrammingError
+          #  puts description
+          #  puts "#{$!.class}: #{$!}\n#{$!.backtrace.join("\n")}"
+          end
+        else
+          @dbi.execute "UPDATE items SET title=?, description=? WHERE rss=? AND link=?",
+            item.title, description, rss_url, link
+          items_updated += 1
+        end
+
+        # Remove all enclosures
+        @dbi.execute "DELETE FROM enclosures WHERE rss=? AND link=?", rss_url, link
+
+        # Re-add all enclosures
+        item.enclosures.each do |enclosure|
+          href = URI::join((rss.link.to_s == '') ? link.to_s : rss.link.to_s, enclosure['href']).to_s
+          @dbi.execute "INSERT INTO enclosures (rss, link, href, mime, title, length) VALUES (?, ?, ?, ?, ?, ?)",
+            rss_url, link, href, enclosure['type'], enclosure['title'],
+            !enclosure['length'] || enclosure['length'].empty? ? 0 : enclosure['length']
+        end
+      }
+      info rss_url_nice + "#{ items_new } new items, #{ items_updated } updated"
+    end; end
+  end
+
+  def sql_queries(task)
+    Dir[ File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }*.sql" ].each
+  end
+
+  def sql_query(task)
+    File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }.sql"
+  end
+end
data/lib/harvester/fetch.rb ADDED
@@ -0,0 +1,96 @@
+# encoding: utf-8
+
+require_relative '../harvester'
+require_relative 'db'
+require_relative 'mrss'
+
+require 'eventmachine'
+require 'em-http'
+require 'uri'
+
+class Harvester
+  module FETCH; end
+
+  # fetches new feed updates and stores them in the database
+  def fetch!
+    info "FETCH"
+    maintenance! unless @settings['no-maintenance']
+    Fetcher.run @dbi, @collections, @settings, @logger do |*args| update(*args) end # results will be passed to the update function
+  end
+end
+
+# fetches new feed updates and stores them in the database using Eventmachine
+module Harvester::Fetcher
+  def self.run(dbi, collections, settings, logger)
+    logger.info '[start] fetch using Eventmachine'
+
+    # prepare logger
+    max_url_size = collections.inject(0){ |acc, (_,rss_urls)| # log display hack
+      if rss_urls
+        max = rss_urls.max_by(&:size).size.to_i
+        acc > max ? acc : max
+      else
+        acc
+      end
+    }
+
+    #dbi['AutoCommit'] = false # TODO check for rdbi
+
+    EventMachine.run do
+      pending = []
+      collections.each{ |collection, rss_urls|
+        rss_urls and rss_urls.each{ |rss_url|
+          # prepare log prefix
+          rss_url_nice = '* ' + rss_url.ljust(max_url_size) + ' | '
+
+          # get last_modified or if new
+          db_rss, last = dbi.execute("SELECT rss, last FROM sources WHERE collection=? AND rss=?",
+            collection, rss_url).fetch
+          new_source = db_rss.nil? || db_rss.empty?
+          uri = URI.parse rss_url
+
+          # prepare request
+          header = {}
+          header['Authorization'] = [uri.user, uri.password] if uri.user
+
+          if new_source || last.nil?
+            logger.info rss_url_nice + "GET"
+          else
+            logger.info rss_url_nice + "GET with If-Modified-Since: #{ last }"
+            header['If-Modified-Since'] = last
+          end
+
+          # do request
+          pending << rss_url
+          http = EM::HttpRequest.new(uri).get :head => header
+
+          http.errback do
+            logger.error rss_url_nice + "Request Error: #{ http.error }"
+
+            pending.delete rss_url
+            EM.stop if pending.empty?
+          end
+
+          http.callback do
+            if http.response_header.status != 200
+              logger.warn rss_url_nice + "HTTP not OK, but: #{ http.response_header.status }"
+            elsif http.response.size > settings['size limit'].to_i
+              logger.warn rss_url_nice + "Got too big response: #{ http.response.size } bytes"
+            else
+              yield rss_url, new_source, collection, http.response, rss_url_nice
+            end
+
+            pending.delete rss_url # same url twice?
+            EM.stop if pending.empty?
+          end
+        }
+      }
+
+      EM.add_timer(settings['timeout'].to_i){
+        pending.each { |rss_url| logger.warn '* ' + rss_url.ljust(max_url_size) + ' | Timed out' }
+        EM.stop
+      }
+    end
+    logger.info '[done ] fetch using Eventmachine'
+  end
+end