harvester 0.8.0.pre.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +45 -0
- data/README.rdoc +74 -0
- data/Rakefile +28 -0
- data/bin/harvester +13 -0
- data/bin/harvester-chart +5 -0
- data/bin/harvester-clock +35 -0
- data/bin/harvester-db +15 -0
- data/bin/harvester-fetch +5 -0
- data/bin/harvester-generate +5 -0
- data/bin/harvester-jabber +6 -0
- data/bin/harvester-new +25 -0
- data/bin/harvester-post +5 -0
- data/bin/harvester-run +14 -0
- data/collections.yaml +15 -0
- data/config.yaml +13 -0
- data/data/ent/HTMLlat1.ent +194 -0
- data/data/ent/HTMLspecial.ent +77 -0
- data/data/ent/HTMLsymbol.ent +241 -0
- data/data/sql/dbd-mysql-isotime.diff +11 -0
- data/data/sql/harvester-0.6-mysql.diff +59 -0
- data/data/sql/harvester-0.7-mysql.diff +39 -0
- data/data/sql/mysql/chart.sql +1 -0
- data/data/sql/mysql/create.table.enclosures.sql +9 -0
- data/data/sql/mysql/create.table.items.sql +8 -0
- data/data/sql/mysql/create.table.jabbersettings.sql +5 -0
- data/data/sql/mysql/create.table.jabbersubscriptions.sql +5 -0
- data/data/sql/mysql/create.table.sources.sql +9 -0
- data/data/sql/mysql/create.view.last48hours.sql +1 -0
- data/data/sql/postgresql/chart.sql +1 -0
- data/data/sql/postgresql/create.table.enclosures.sql +9 -0
- data/data/sql/postgresql/create.table.items.sql +8 -0
- data/data/sql/postgresql/create.table.jabbersettings.sql +5 -0
- data/data/sql/postgresql/create.table.jabbersubscriptions.sql +5 -0
- data/data/sql/postgresql/create.table.sources.sql +9 -0
- data/data/sql/postgresql/create.view.last48hours.sql +1 -0
- data/data/sql/sqlite3/chart.sql +1 -0
- data/data/sql/sqlite3/create.table.enclosures.sql +9 -0
- data/data/sql/sqlite3/create.table.items.sql +8 -0
- data/data/sql/sqlite3/create.table.jabbersettings.sql +5 -0
- data/data/sql/sqlite3/create.table.jabbersubscriptions.sql +5 -0
- data/data/sql/sqlite3/create.table.sources.sql +9 -0
- data/data/sql/sqlite3/create.view.last48hours.sql +1 -0
- data/data/templates/atom-all.xml +88 -0
- data/data/templates/atom.xml +88 -0
- data/data/templates/index.html +412 -0
- data/data/templates/rss-all.rdf +86 -0
- data/data/templates/rss.rdf +85 -0
- data/data/templates/static/harvester.css +365 -0
- data/data/templates/static/harvester.gif +0 -0
- data/data/templates/static/harvester_ie7.css +15 -0
- data/data/templates/static/harvester_lte_ie6.css +27 -0
- data/harvester.gemspec +35 -0
- data/lib/harvester.rb +132 -0
- data/lib/harvester/chart.rb +72 -0
- data/lib/harvester/db.rb +123 -0
- data/lib/harvester/fetch.rb +96 -0
- data/lib/harvester/generate.rb +152 -0
- data/lib/harvester/generator/entity_translator.rb +46 -0
- data/lib/harvester/generator/link_absolutizer.rb +39 -0
- data/lib/harvester/jabber.rb +443 -0
- data/lib/harvester/mrss.rb +355 -0
- data/lib/harvester/post.rb +19 -0
- metadata +237 -0
Binary file
|
@@ -0,0 +1,15 @@
|
|
1
|
+
/*
|
2
|
+
* harvester_ie7.css
|
3
|
+
*
|
4
|
+
* Description: Anpassungen des Defaultstylesheets 'harvester.css'
|
5
|
+
für IE7
|
6
|
+
*
|
7
|
+
*/
|
8
|
+
|
9
|
+
/* ------------------------------------------------------ */
|
10
|
+
/* Blogeintraege */
|
11
|
+
/* ------------------------------------------------------ */
|
12
|
+
|
13
|
+
.entry-content pre, .entry-content code {
|
14
|
+
overflow: scroll;
|
15
|
+
}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
/*
|
2
|
+
* harvester_lte_ie6.css
|
3
|
+
*
|
4
|
+
* Description: Anpassungen des Defaultstylesheets 'harvester.css'
|
5
|
+
für IE6 und aeltere Versionen
|
6
|
+
*
|
7
|
+
*/
|
8
|
+
|
9
|
+
/* ------------------------------------------------------ */
|
10
|
+
/* Blogeintraege */
|
11
|
+
/* ------------------------------------------------------ */
|
12
|
+
|
13
|
+
.entry {
|
14
|
+
width: 100%;
|
15
|
+
/*width: 98%;*/ /* dirty hack */
|
16
|
+
}
|
17
|
+
|
18
|
+
.entry-content pre,
|
19
|
+
.entry-content code {
|
20
|
+
width: 90%; /* dirty hack */
|
21
|
+
overflow: scroll;
|
22
|
+
}
|
23
|
+
|
24
|
+
.tickers3 .ticker ul li {
|
25
|
+
padding-bottom: 1em;
|
26
|
+
width: 100%;
|
27
|
+
}
|
data/harvester.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'rubygems' unless defined? Gem
|
3
|
+
require File.dirname(__FILE__) + "/lib/harvester"
|
4
|
+
|
5
|
+
# Gem specification of the harvester feed aggregator.
Gem::Specification.new do |spec|
  # general gem metadata
  spec.name        = "harvester"
  spec.version     = Harvester::VERSION
  spec.authors     = ["astro", "Neingeist", "Tigion", "Josef Spillner", "Jan Lelis"]
  spec.email       = "mail@janlelis.de"
  spec.homepage    = "https://github.com/janlelis/harvester"
  spec.summary     = "Web-based feed aggregator"
  spec.description = "The harvester collects your favourite feeds and generates static html/feed pages"

  # minimum platform versions
  spec.required_ruby_version     = ">= 1.9.2"
  spec.required_rubygems_version = ">= 1.3.6"

  # runtime dependencies, grouped by the harvester component that uses them
  # (declared in this order; xmpp4r for the jabber component is currently not declared)
  {
    'main'     => %w[rdbi rdbi-driver-sqlite3 logger-colors],
    'fetch'    => %w[eventmachine em-http-request],
    'generate' => %w[ruby-xslt hpricot],
    'chart'    => %w[rmagick gruff],
    'clock'    => %w[clockwork],
  }.each_value do |gem_names|
    gem_names.each { |gem_name| spec.add_dependency gem_name }
  end

  # packaged files: library, executables, docs, data directory and yaml configs
  packaged = Dir.glob(%w|lib/**/*.rb bin/* [A-Z]*.{txt,rdoc} data/**/* *.yaml|)
  spec.files       = packaged + %w|Rakefile harvester.gemspec|
  spec.executables = Dir['bin/*'].map { |path| File.basename(path) }
  spec.license     = "AGPL"
end
|
data/lib/harvester.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
require 'rubygems' unless defined? Gem
|
2
|
+
require 'rdbi'
|
3
|
+
require 'yaml'
|
4
|
+
require 'logger/colors'
|
5
|
+
|
6
|
+
# Core of the harvester: reads the yaml config and the feed collections,
# sets up the logger and opens the database connection.
class Harvester
  VERSION = '0.8.0.pre.1'

  attr_reader :config, :settings, :collections, :dbi, :logger

  # Creates a new Harvester instance.
  #
  # options - Hash with String keys. 'config' is the path of the yaml config
  #           file (default: './config.yaml'); every other entry overrides
  #           the setting of the same name.
  #
  # Raises LoadError if the config file or the collections file does not
  # exist. Errors while connecting to the database are logged and re-raised.
  def initialize(options = {})
    # command line options (copied, so the caller's hash is left untouched)
    options     = options.dup
    config_path = File.expand_path(options.delete('config') || './config.yaml')

    # load config (an empty yaml file yields a falsy value, treat it as {})
    begin
      @config = YAML::load_file(config_path) || {}
    rescue Errno::ENOENT
      raise LoadError, "Could not find a yaml config file at #{ config_path }"
    end

    # defaults < config file settings < command line options
    @settings = {
      'collections' => 'collections.yaml',
      'timeout'     => 90,
      'size limit'  => 200_000,
      'log_level'   => Logger::DEBUG, # 0
      'log_file'    => STDOUT,
    }
    @settings.merge!(@config['settings'] || {})
    @settings.merge! options

    # load collections
    begin
      @collections = YAML::load_file @settings['collections']
    rescue Errno::ENOENT
      raise LoadError, "Could not find a yaml collections file at #{ File.expand_path( @settings['collections'] ) }"
    end

    # init logger; only the bare message is printed
    @logger = Logger.new(log_device(@settings['log_file']))
    @logger.formatter = proc { |level, datetime, appname, msg| "#{msg}\n" }
    # the level may be given by name (debug..fatal) or as a number
    @logger.level = if %w[debug info warn error fatal].include?(@settings['log_level'].to_s.downcase)
      Logger::Severity.const_get(@settings['log_level'].to_s.upcase)
    else
      @settings['log_level'].to_i
    end

    # connect to db
    begin
      require 'rdbi/driver/' + config['db']['driver'].downcase # FIXME?

      @dbi = RDBI::connect config['db']['driver'],
        database: config['db']['database'],
        user:     config['db']['user'],
        password: config['db']['password']
    rescue StandardError, LoadError
      # LoadError (missing driver) is not a StandardError, so it is listed
      # explicitly; signals and exit requests are no longer intercepted
      error 'Something is wrong with your database settings:'
      raise
    end
  end

  # Creates a new harvester using the command-line options (ARGV) to configure it.
  # Recognized options are removed from ARGV by the option parser.
  def self.new_from_argv
    options = {}

    require 'optparse'
    # NOTE(review): the banner text is reproduced verbatim; confirm its column alignment
    OptionParser.new do |op|
      op.banner = %q{USAGE:
harvester <COMMAND> [OPTIONS]
COMMANDS:
run run a complete harvester update
fetch run only the fetch script
generate run only the generate script
chart run only the generate chart script
post run only the post processing script
db start a database task (create or maintenance)
clock start the scheduler (cron replacement)
new create a new harvester project
jabber start the jabber bot (not implemented yet)
OPTIONS:} # automatically added as --help
      op.on('-v', '--version') do
        puts Harvester::VERSION
        exit
      end
      op.on('-c', '--config FILE') do |config|
        options['config'] = config
      end
      op.on('-l', '--log_file FILE') do |log_file|
        options['log_file'] = log_file
      end
      op.on('-L', '--log_level NUMBER') do |log_level|
        options['log_level'] = log_level
      end
      op.on('-p', '--post_script FILE') do |post_script|
        options['post_script'] = post_script
      end
      op.on('-m', '--no-maintenance') do
        options['no-maintenance'] = true
      end
      op.on('-s', '--no-chart') do
        options['no-chart'] = true
      end
    end.parse!

    Harvester.new options
  end

  protected

  # logger helpers
  def debug(msg) @logger.debug(msg) end
  def info(msg)  @logger.info(msg)  end
  def warn(msg)  @logger.warn(msg)  end
  def error(msg) @logger.error(msg) end
  def fatal(msg) @logger.fatal(msg) end

  # adds an info message before and after the block
  def task(msg) # MAYBE: nested spaces+behaviour
    info "[start] " + msg
    yield
    info "[done ] " + msg
  end

  private

  # Maps the 'log_file' setting to something Logger.new accepts:
  # nil/false fall back to STDOUT, the strings "STDOUT"/"STDERR" are resolved
  # to the matching constant, anything else (IO object or path) is passed on.
  def log_device(target)
    case target
    when nil, false               then STDOUT
    when /\ASTD(?:OUT|ERR)\z/     then Object.const_get(target) # Regexp#=== is false for non-strings
    else target
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative '../harvester'
|
4
|
+
require_relative '../harvester/db'
|
5
|
+
require 'gruff'
|
6
|
+
|
7
|
+
class Harvester
  module CHART; end

  # Generates the fetch statistics image: runs the driver specific chart
  # query, counts one item per returned (date, collection) row and writes
  # the resulting chart to chart.jpg inside the configured output directory.
  def chart!
    info "CHART"
    task "generate chart" do
      stats = Chart::StatsPerCollection.new
      rows  = @dbi.execute(File.read(sql_query(:chart)))
      rows.each do |date, collection|
        # rows are grouped by the day of month of their date
        stats.add_one(collection, Date.parse(date).day)
      end

      chart = Chart.new(stats)
      chart.write File.join( @config['settings']['output'], '/chart.jpg' )
    end
  end
end
|
22
|
+
|
23
|
+
# Line chart of the harvested items per day, drawn with gruff.
class Harvester::Chart
  # stats   - statistics object; its #each results are passed to the graph's
  #           #data method and its #days become the x axis labels
  # options - not used yet (TODO configure g with options)
  def initialize(stats, options = {})
    @g = Gruff::Line.new(300)
    @g.title        = "Harvested items per day"
    @g.x_axis_label = "Days"
    @g.y_axis_label = "Items"

    # one data series per entry yielded by stats
    add_series = @g.method(:data)
    stats.each(&add_series)

    # label every x position with its day
    @g.labels = Hash[stats.days.each_with_index.map { |day, position| [position, day.to_s] }]
  end

  # Writes the rendered chart to the file at path.
  def write(path)
    @g.write(path)
  end
end
|
44
|
+
|
45
|
+
# Counts items per collection and day.
class Harvester::Chart::StatsPerCollection
  # days in the order they were first seen
  attr_reader :days

  def initialize
    @collections = {}
    @days = []
  end

  # Registers one item of the given collection for the given day.
  # A nil collection is counted as '(unknown)'.
  def add_one(collection, day)
    @days.push(day) unless @days.include?(day)
    name = collection || '(unknown)' # TODO research

    counts = (@collections[name] ||= {})
    counts[day] = counts[day].to_i + 1
    counts
  end

  # Yields every collection name together with its item counts, one count
  # per known day (0 for days without items), in the order of #days.
  def each
    @collections.each do |name, counts|
      yield name, @days.map { |day| counts[day].to_i }
    end
  end
end
|
data/lib/harvester/db.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative '../harvester'
|
4
|
+
|
5
|
+
class Harvester
  module DB; end

  # Creates the required database structure by executing every create*.sql
  # file of the configured driver inside one transaction.
  def create!
    task "create database tables" do
      @dbi.transaction do
        sql_queries(:create).each do |sql|
          info "* execute " + File.basename(sql)
          @dbi.execute File.read(sql)
        end
      end
    end
  end

  # Checks for feed source changes: removes sources from the database that
  # are no longer listed in their collection and purges the items of feeds
  # that are not referenced by any collection anymore.
  def maintenance!
    task "look for sources to purge" do
      # (collection, rss) pairs stored in the db but missing in the collections
      purge = []
      @dbi.execute("SELECT collection, rss FROM sources").each do |dbc, dbr|
        purge << [dbc, dbr] unless (@collections[dbc] || []).include? dbr
      end

      purge_rss = purge.map do |c, r|
        info "* remove #{c}:#{r}..."
        @dbi.execute "DELETE FROM sources WHERE collection=? AND rss=?", c, r
        r
      end

      # keep the items of feeds that are still part of another collection
      purge_rss.reject! do |r|
        keeper = @collections.find { |_, cfr| cfr && cfr.include?(r) }
        warn "* must keep #{r} because it's still in #{keeper.first}" if keeper
        keeper
      end

      purge_rss.each do |r|
        info "* purge items from feed #{r}"
        @dbi.execute "DELETE FROM items WHERE rss=?", r
      end
    end
  end

  private

  # Parses a fetched feed and stores its source, items and enclosures in
  # the database within one transaction.
  #
  # rss_url      - url of the feed
  # new_source   - true if the feed is not yet in the sources table
  # collection   - name of the collection the feed belongs to
  # response     - fetched feed, handed to MRSS.parse
  # rss_url_nice - prefix for log messages (defaults to rss_url)
  def update(rss_url, new_source, collection, response, rss_url_nice = rss_url)
    rss = MRSS.parse(response)
    # NOTE(review): fetch.rb passes the response body here, so
    # response['Last-Modified'] is a String lookup, not a header - confirm
    last_modified = response['Last-Modified']
    # relative links are resolved against the feed's link, or against the
    # feed url if the feed does not provide one
    base = (rss.link.to_s == '') ? rss_url.to_s : rss.link.to_s

    @dbi.transaction do
      # update source
      if new_source
        @dbi.execute "INSERT INTO sources (collection, rss, last, title, link, description) VALUES (?, ?, ?, ?, ?, ?)",
          collection, rss_url, last_modified, rss.title, rss.link, rss.description
        info rss_url_nice + "Added as source"
      else
        @dbi.execute "UPDATE sources SET last=?, title=?, link=?, description=? WHERE collection=? AND rss=?",
          last_modified, rss.title, rss.link, rss.description, collection, rss_url
        debug rss_url_nice + "Source updated"
      end

      # update items
      items_new, items_updated = 0, 0
      rss.items.each do |item|
        description = item.description

        # Link mangling
        begin
          link = URI::join(base, item.link || rss.link).to_s
        rescue URI::Error
          link = item.link
        end

        # Push into database
        db_title, = *@dbi.execute("SELECT title FROM items WHERE rss=? AND link=?", rss_url, link).fetch

        if db_title.nil? || db_title.empty? # item is new
          @dbi.execute "INSERT INTO items (rss, title, link, date, description) VALUES (?, ?, ?, ?, ?)",
            rss_url, item.title, link, item.date.to_s, description
          items_new += 1
        else
          @dbi.execute "UPDATE items SET title=?, description=? WHERE rss=? AND link=?",
            item.title, description, rss_url, link
          items_updated += 1
        end

        # Remove all enclosures
        @dbi.execute "DELETE FROM enclosures WHERE rss=? AND link=?", rss_url, link

        # Re-add all enclosures
        item.enclosures.each do |enclosure|
          href   = URI::join((rss.link.to_s == '') ? link.to_s : rss.link.to_s, enclosure['href']).to_s
          length = (!enclosure['length'] || enclosure['length'].empty?) ? 0 : enclosure['length']
          @dbi.execute "INSERT INTO enclosures (rss, link, href, mime, title, length) VALUES (?, ?, ?, ?, ?, ?)",
            rss_url, link, href, enclosure['type'], enclosure['title'], length
        end
      end
      info rss_url_nice + "#{ items_new } new items, #{ items_updated } updated"
    end
  end

  # Returns an enumerator over the paths of all sql files of the configured
  # driver whose name starts with task. The paths are sorted, so the order
  # does not depend on the file system (create.table.* precedes create.view.*).
  def sql_queries(task)
    Dir[ File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }*.sql" ].sort.each
  end

  # Returns the path of the sql file named task for the configured driver.
  def sql_query(task)
    File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }.sql"
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative '../harvester'
|
4
|
+
require_relative 'db'
|
5
|
+
require_relative 'mrss'
|
6
|
+
|
7
|
+
require 'eventmachine'
|
8
|
+
require 'em-http'
|
9
|
+
require 'uri'
|
10
|
+
|
11
|
+
class Harvester
  module FETCH; end

  # Fetches new feed updates and stores them in the database.
  # Runs the maintenance step first unless the 'no-maintenance' setting is set.
  def fetch!
    info "FETCH"
    maintenance! unless @settings['no-maintenance']

    # every result yielded by the fetcher is passed on to the update function
    Fetcher.run(@dbi, @collections, @settings, @logger) { |*result| update(*result) }
  end
end
|
21
|
+
|
22
|
+
# Fetches feed updates concurrently using Eventmachine and hands every
# usable response to the given block.
module Harvester::Fetcher
  # Requests all feeds of all collections.
  #
  # dbi         - database handle, queried for the stored 'last' value of each source
  # collections - Hash of collection name => Array of feed urls (or nil)
  # settings    - Hash, 'size limit' (bytes) and 'timeout' (seconds) are used
  # logger      - logger receiving the progress messages
  #
  # Yields rss_url, new_source, collection, the response body and the log
  # prefix for every response with status 200 that is within the size limit.
  def self.run(dbi, collections, settings, logger)
    logger.info '[start] fetch using Eventmachine'

    # prepare logger: pad all urls to the longest one (log display hack);
    # empty or missing url lists simply do not contribute
    max_url_size = collections.values.compact.flat_map { |rss_urls| rss_urls.map(&:size) }.max.to_i
    log_prefix   = lambda { |rss_url| '* ' + rss_url.ljust(max_url_size) + ' | ' }

    #dbi['AutoCommit'] = false # TODO check for rdbi

    EventMachine.run do
      # running requests as [collection, rss_url] jobs, so that the same url
      # in two collections is tracked twice
      pending = []
      finish  = lambda do |job|
        position = pending.index(job)
        pending.delete_at(position) if position # removes this job only
        EM.stop if pending.empty?
      end

      collections.each do |collection, rss_urls|
        (rss_urls || []).each do |rss_url|
          # prepare log prefix
          rss_url_nice = log_prefix.call(rss_url)

          # get last_modified or if new
          # NOTE(review): assumes #fetch returns a single [rss, last] row - confirm for rdbi
          db_rss, last = dbi.execute("SELECT rss, last FROM sources WHERE collection=? AND rss=?",
                                     collection, rss_url).fetch
          new_source = db_rss.nil? || db_rss.empty?
          uri = URI.parse rss_url

          # prepare request
          header = {}
          header['Authorization'] = [uri.user, uri.password] if uri.user

          if new_source || last.nil?
            logger.info rss_url_nice + "GET"
          else
            logger.info rss_url_nice + "GET with If-Modified-Since: #{ last }"
            header['If-Modified-Since'] = last
          end

          # do request
          job = [collection, rss_url]
          pending << job
          http = EM::HttpRequest.new(uri).get :head => header

          http.errback do
            logger.error rss_url_nice + "Request Error: #{ http.error }"
            finish.call(job)
          end

          http.callback do
            body = http.response
            if http.response_header.status != 200
              logger.warn rss_url_nice + "HTTP not OK, but: #{ http.response_header.status }"
            elsif body.size > settings['size limit'].to_i
              logger.warn rss_url_nice + "Got too big response: #{ body.size } bytes"
            else
              yield rss_url, new_source, collection, body, rss_url_nice
            end
            finish.call(job)
          end
        end
      end

      if pending.empty?
        # nothing was requested, do not idle until the timeout
        EM.stop
      else
        # give up on everything that is still running after the timeout
        EM.add_timer(settings['timeout'].to_i) do
          pending.each { |_, rss_url| logger.warn log_prefix.call(rss_url) + 'Timed out' }
          EM.stop
        end
      end
    end
    logger.info '[done ] fetch using Eventmachine'
  end
end
|