harvester 0.8.0.pre.2 → 0.8.0.pre.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +20 -15
- data/bin/harvester-create +5 -0
- data/bin/harvester-jabber +1 -1
- data/bin/harvester-maintenance +5 -0
- data/bin/harvester-run +5 -2
- data/bin/harvester-stats +5 -0
- data/data/templates/index.html +0 -2
- data/lib/harvester.rb +17 -9
- data/lib/harvester/create.rb +19 -0
- data/lib/harvester/fetch.rb +68 -7
- data/lib/harvester/generate.rb +8 -8
- data/lib/harvester/jabber.rb +2 -0
- data/lib/harvester/maintenance.rb +42 -0
- data/lib/harvester/mrss.rb +4 -3
- data/lib/harvester/post.rb +3 -2
- data/lib/harvester/{chart.rb → stats.rb} +3 -5
- metadata +11 -8
- data/bin/harvester-chart +0 -5
- data/bin/harvester-db +0 -15
- data/lib/harvester/db.rb +0 -123
data/README.rdoc
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
= Harvester 0.8
|
2
2
|
The Harvester is a web-based Feed-Aggregator
|
3
3
|
|
4
|
-
|
5
|
-
http://
|
4
|
+
Running instances can be seen at:
|
5
|
+
http://blog-harvester.de or http://rubynetz.de
|
6
6
|
|
7
7
|
The Harvester eats the feeds you want and produces a static html/feed page that aggregates all those.
|
8
8
|
|
@@ -14,7 +14,7 @@ Harvester 0.8 is alpha quality! There are still some unfixed bugs (e.g. database
|
|
14
14
|
|
15
15
|
Install the harvester with
|
16
16
|
|
17
|
-
gem install harvester
|
17
|
+
gem install harvester --pre
|
18
18
|
|
19
19
|
You can now create a new harvester project with
|
20
20
|
|
@@ -31,16 +31,6 @@ Then you will need to configure <tt>config.yaml</tt>, <tt>collections.yaml</tt>
|
|
31
31
|
|
32
32
|
The <tt>collections.yaml</tt> file contains the links to your desired feeds.
|
33
33
|
|
34
|
-
== Todo
|
35
|
-
* Still some things broken after update
|
36
|
-
* Fix database issues
|
37
|
-
* Improve/fix feed parsing
|
38
|
-
* Tidy up templates
|
39
|
-
* Security issues
|
40
|
-
* Optimize performance
|
41
|
-
* Fix jabberbot
|
42
|
-
* Implement some kind of tag filters
|
43
|
-
|
44
34
|
== Usage
|
45
35
|
|
46
36
|
Then run
|
@@ -55,13 +45,13 @@ This will create the output files. That's it ;).
|
|
55
45
|
|
56
46
|
Usually, you want to run this command automatically every x seconds/minutes. You can use a program like <tt>cron</tt> or simply run
|
57
47
|
|
58
|
-
harvester clock
|
48
|
+
harvester clock <time>
|
59
49
|
|
60
50
|
to start a simple scheduler.
|
61
51
|
|
62
52
|
There are some more harvester commands to explore. Run
|
63
53
|
|
64
|
-
harvester
|
54
|
+
harvester
|
65
55
|
|
66
56
|
to get a command list.
|
67
57
|
|
@@ -69,6 +59,21 @@ to get a command list.
|
|
69
59
|
|
70
60
|
...
|
71
61
|
|
62
|
+
|
63
|
+
== Todo
|
64
|
+
* Still some things broken after update
|
65
|
+
* Fix database issues
|
66
|
+
* Fix wrong tweet escaping (sometimes: ä-->ä)
|
67
|
+
* Improve/fix feed parsing (especially atom)
|
68
|
+
* Fix chart generation
|
69
|
+
* Tidy up templates / remove astro-specific links
|
70
|
+
* Extract mrss.rb into extra gem
|
71
|
+
|
72
|
+
* Security issues
|
73
|
+
* Optimize performance
|
74
|
+
* Fix jabber bot
|
75
|
+
* Implement some kind of tag filters
|
76
|
+
|
72
77
|
== Credits
|
73
78
|
|
74
79
|
* {Astro}[https://github.com/astro] (2005-2008)
|
data/bin/harvester-jabber
CHANGED
data/bin/harvester-run
CHANGED
@@ -2,13 +2,16 @@
|
|
2
2
|
# encoding: utf-8
|
3
3
|
|
4
4
|
# run a complete harvester update :)
|
5
|
+
require_relative '../lib/harvester'
|
5
6
|
require_relative '../lib/harvester/fetch'
|
6
7
|
require_relative '../lib/harvester/generate'
|
7
|
-
require_relative '../lib/harvester/chart'
|
8
8
|
require_relative '../lib/harvester/post'
|
9
9
|
|
10
10
|
harve = Harvester.new_from_argv
|
11
|
+
( require_relative '../lib/harvester/maintenance'
|
12
|
+
harve.maintenance! ) if harve.settings['maintenance']
|
11
13
|
harve.fetch!
|
12
14
|
harve.generate!
|
13
|
-
|
15
|
+
( require_relative '../lib/harvester/stats'
|
16
|
+
harve.stats! ) if harve.settings['stats']
|
14
17
|
harve.post!
|
data/bin/harvester-stats
ADDED
data/data/templates/index.html
CHANGED
data/lib/harvester.rb
CHANGED
@@ -4,7 +4,7 @@ require 'yaml'
|
|
4
4
|
require 'logger/colors'
|
5
5
|
|
6
6
|
class Harvester
|
7
|
-
VERSION = '0.8.0.pre.2'
|
7
|
+
VERSION = '0.8.0.pre.3'
|
8
8
|
|
9
9
|
attr_reader :config, :settings, :collections, :dbi, :logger
|
10
10
|
|
@@ -62,7 +62,7 @@ class Harvester
|
|
62
62
|
database: config['db']['database'],
|
63
63
|
user: config['db']['user'],
|
64
64
|
password: config['db']['password'],
|
65
|
-
host: "localhost",
|
65
|
+
host: "localhost", # FIXME?
|
66
66
|
rescue Exception
|
67
67
|
error 'Something is wrong with your database settings:'
|
68
68
|
raise
|
@@ -81,7 +81,7 @@ COMMANDS:
|
|
81
81
|
run run a complete harvester update
|
82
82
|
fetch run only the fetch script
|
83
83
|
generate run only the generate script
|
84
|
-
|
84
|
+
stats run only the generate chart script
|
85
85
|
post run only the post processing script
|
86
86
|
db start a database task (create or maintenance)
|
87
87
|
clock start the scheduler (cron replacement)
|
@@ -104,11 +104,11 @@ OPTIONS:} # automatically added as --help
|
|
104
104
|
op.on('-p', '--post_script FILE') do |post_script|
|
105
105
|
options['post_script'] = post_script
|
106
106
|
end
|
107
|
-
op.on('-m', '--
|
108
|
-
options['
|
107
|
+
op.on('-m', '--maintenance') do
|
108
|
+
options['maintenance'] = true
|
109
109
|
end
|
110
|
-
op.on('-
|
111
|
-
options['
|
110
|
+
op.on('-s', '--stats') do
|
111
|
+
options['stats'] = true
|
112
112
|
end
|
113
113
|
end.parse!
|
114
114
|
|
@@ -124,10 +124,18 @@ OPTIONS:} # automatically added as --help
|
|
124
124
|
def error(msg) @logger.error(msg) end
|
125
125
|
def fatal(msg) @logger.fatal(msg) end
|
126
126
|
|
127
|
-
# adds an info message before and after the block
|
128
|
-
def task(msg) # MAYBE: nested spaces+behaviour
|
127
|
+
def task(msg) # adds an info message before and after the block MAYBE: nested spaces+behaviour
|
129
128
|
info "[start] " + msg
|
130
129
|
yield
|
131
130
|
info "[done ] " + msg
|
132
131
|
end
|
132
|
+
|
133
|
+
# database helpers
|
134
|
+
def sql_queries(task)
|
135
|
+
Dir[ File.dirname(__FILE__) + "/../data/sql/#{ @config['db']['driver'].downcase }/#{ task }*.sql" ].each
|
136
|
+
end
|
137
|
+
|
138
|
+
def sql_query(task)
|
139
|
+
File.dirname(__FILE__) + "/../data/sql/#{ @config['db']['driver'].downcase }/#{ task }.sql"
|
140
|
+
end
|
133
141
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative '../harvester'
|
4
|
+
|
5
|
+
class Harvester
|
6
|
+
CREATE = true
|
7
|
+
# creates required database structure
|
8
|
+
def create!
|
9
|
+
info "CREATE"
|
10
|
+
task "create database tables" do
|
11
|
+
begin @dbi.transaction do
|
12
|
+
sql_queries(:create).each{ |sql|
|
13
|
+
info "* execute " + File.basename(sql)
|
14
|
+
@dbi.execute File.read(sql)
|
15
|
+
}
|
16
|
+
end; end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/harvester/fetch.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require_relative '../harvester'
|
4
|
-
require_relative 'db'
|
5
4
|
require_relative 'mrss'
|
6
5
|
|
7
6
|
require 'eventmachine'
|
@@ -9,13 +8,74 @@ require 'em-http'
|
|
9
8
|
require 'uri'
|
10
9
|
|
11
10
|
class Harvester
|
12
|
-
|
13
|
-
|
11
|
+
FETCH = true
|
14
12
|
# fetches new feed updates and store them in the database
|
15
13
|
def fetch!
|
16
14
|
info "FETCH"
|
17
|
-
|
18
|
-
|
15
|
+
Fetcher.run @dbi, @collections, @settings, @logger do |*args| update_db(*args) end # results will be passed to the update function
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
# saves result of a request in db
|
21
|
+
def update_db(rss_url, new_source, collection, response, rss_url_nice = rss_url)
|
22
|
+
rss = MRSS.parse(response)
|
23
|
+
|
24
|
+
begin @dbi.transaction do
|
25
|
+
# update source
|
26
|
+
if new_source
|
27
|
+
@dbi.execute "INSERT INTO sources (collection, rss, last, title, link, description) VALUES (?, ?, ?, ?, ?, ?)",
|
28
|
+
collection, rss_url, response['Last-Modified'], rss.title, rss.link, rss.description
|
29
|
+
info rss_url_nice + "Added as source"
|
30
|
+
else
|
31
|
+
@dbi.execute "UPDATE sources SET last=?, title=?, link=?, description=? WHERE collection=? AND rss=?",
|
32
|
+
response['Last-Modified'], rss.title, rss.link, rss.description, collection, rss_url
|
33
|
+
debug rss_url_nice + "Source updated"
|
34
|
+
end
|
35
|
+
|
36
|
+
# update items
|
37
|
+
items_new, items_updated = 0, 0
|
38
|
+
rss.items.each { |item|
|
39
|
+
description = item.description
|
40
|
+
|
41
|
+
# Link mangling
|
42
|
+
begin
|
43
|
+
link = URI::join((rss.link.to_s == '') ? URI.parse(rss_url).to_s : rss.link.to_s, item.link || rss.link).to_s
|
44
|
+
rescue URI::Error
|
45
|
+
link = item.link
|
46
|
+
end
|
47
|
+
|
48
|
+
# Push into database
|
49
|
+
db_title, = *@dbi.execute("SELECT title FROM items WHERE rss=? AND link=?", rss_url, link).fetch
|
50
|
+
|
51
|
+
if db_title.nil? || db_title.empty? # item is new
|
52
|
+
begin
|
53
|
+
@dbi.execute "INSERT INTO items (rss, title, link, date, description) VALUES (?, ?, ?, ?, ?)",
|
54
|
+
rss_url, item.title, link, item.date.to_s, description
|
55
|
+
items_new += 1
|
56
|
+
#rescue DBI::ProgrammingError
|
57
|
+
# puts description
|
58
|
+
# puts "#{$!.class}: #{$!}\n#{$!.backtrace.join("\n")}"
|
59
|
+
end
|
60
|
+
else
|
61
|
+
@dbi.execute "UPDATE items SET title=?, description=?, date=? WHERE rss=? AND link=?",
|
62
|
+
item.title, description, item.date.to_s, rss_url, link
|
63
|
+
items_updated += 1
|
64
|
+
end
|
65
|
+
|
66
|
+
# Remove all enclosures
|
67
|
+
@dbi.execute "DELETE FROM enclosures WHERE rss=? AND link=?", rss_url, link
|
68
|
+
|
69
|
+
# Re-add all enclosures
|
70
|
+
item.enclosures.each do |enclosure|
|
71
|
+
href = URI::join((rss.link.to_s == '') ? link.to_s : rss.link.to_s, enclosure['href']).to_s
|
72
|
+
@dbi.execute "INSERT INTO enclosures (rss, link, href, mime, title, length) VALUES (?, ?, ?, ?, ?, ?)",
|
73
|
+
rss_url, link, href, enclosure['type'], enclosure['title'],
|
74
|
+
!enclosure['length'] || enclosure['length'].empty? ? 0 : enclosure['length']
|
75
|
+
end
|
76
|
+
}
|
77
|
+
info rss_url_nice + "#{ items_new } new items, #{ items_updated } updated"
|
78
|
+
end; end
|
19
79
|
end
|
20
80
|
end
|
21
81
|
|
@@ -47,7 +107,7 @@ module Harvester::Fetcher
|
|
47
107
|
db_rss, last = dbi.execute("SELECT rss, last FROM sources WHERE collection=? AND rss=?",
|
48
108
|
collection, rss_url).fetch
|
49
109
|
new_source = db_rss.nil? || db_rss.empty?
|
50
|
-
uri = URI.parse
|
110
|
+
uri = URI.parse(rss_url)
|
51
111
|
|
52
112
|
# prepare request
|
53
113
|
header = {}
|
@@ -77,7 +137,7 @@ module Harvester::Fetcher
|
|
77
137
|
elsif http.response.size > settings['size limit'].to_i
|
78
138
|
logger.warn rss_url_nice + "Got too big repsonse: #{ response.size } bytes"
|
79
139
|
else
|
80
|
-
yield rss_url, new_source, collection, http.response, rss_url_nice
|
140
|
+
yield rss_url, new_source, collection, http.response, rss_url_nice # TODO clean up
|
81
141
|
end
|
82
142
|
|
83
143
|
pending.delete rss_url # same url twice?
|
@@ -85,6 +145,7 @@ module Harvester::Fetcher
|
|
85
145
|
end
|
86
146
|
}
|
87
147
|
}
|
148
|
+
EM.stop if pending.empty? # e.g. no collections configured
|
88
149
|
|
89
150
|
EM.add_timer(settings['timeout'].to_i){
|
90
151
|
pending.each { |rss_url| logger.warn rss_url_nice + 'Timed out' }
|
data/lib/harvester/generate.rb
CHANGED
@@ -14,13 +14,12 @@ rescue LoadError
|
|
14
14
|
end
|
15
15
|
|
16
16
|
class Harvester
|
17
|
-
|
18
|
-
|
17
|
+
GENERATE = true
|
19
18
|
# generates the static html/feed files
|
20
19
|
def generate!
|
21
20
|
info "GENERATE"
|
22
21
|
|
23
|
-
f = Generator.new @dbi, @logger
|
22
|
+
f = Generator.new @dbi, @settings, @logger
|
24
23
|
xslt = XML::XSLT.new
|
25
24
|
xslt.xml = f.generate_root.to_s
|
26
25
|
|
@@ -54,9 +53,10 @@ end
|
|
54
53
|
class Harvester::Generator
|
55
54
|
FUNC_NAMESPACE = 'http://astroblog.spaceboyz.net/harvester/xslt-functions'
|
56
55
|
|
57
|
-
def initialize(dbi, logger)
|
58
|
-
@dbi
|
59
|
-
@
|
56
|
+
def initialize(dbi, settings, logger) # TODO default values for arg2 and arg3
|
57
|
+
@dbi = dbi
|
58
|
+
@settings = settings
|
59
|
+
@logger = logger
|
60
60
|
%w(collection-items feed-items item-description item-images item-enclosures).each { |func|
|
61
61
|
XML::XSLT.extFunction(func, FUNC_NAMESPACE, self)
|
62
62
|
}
|
@@ -80,7 +80,7 @@ class Harvester::Generator
|
|
80
80
|
EntityTranslator.run(root, true, @logger)
|
81
81
|
end
|
82
82
|
|
83
|
-
def collection_items(collection, max=
|
83
|
+
def collection_items(collection, max = 99)
|
84
84
|
items = REXML::Element.new('items')
|
85
85
|
@dbi.execute("SELECT items.title,items.date,items.link,items.rss FROM items,sources WHERE items.rss=sources.rss AND sources.collection LIKE ? ORDER BY items.date DESC LIMIT ?", collection, max.to_i).each{ |title,date,link,rss|
|
86
86
|
if title # TODO: debug (sqlite)
|
@@ -95,7 +95,7 @@ class Harvester::Generator
|
|
95
95
|
EntityTranslator.run(items, true, @logger)
|
96
96
|
end
|
97
97
|
|
98
|
-
def feed_items(rss, max=23)
|
98
|
+
def feed_items(rss, max = 23)
|
99
99
|
items = REXML::Element.new('items')
|
100
100
|
@dbi.execute("SELECT title,date,link FROM items WHERE rss=? ORDER BY date DESC LIMIT ?", rss, max.to_i).each{ |title,date,link| #p rss,title,date,link
|
101
101
|
# p title
|
data/lib/harvester/jabber.rb
CHANGED
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative '../harvester'
|
4
|
+
|
5
|
+
class Harvester
|
6
|
+
MAINTENANCE = true
|
7
|
+
# check for feed source changes
|
8
|
+
def maintenance!
|
9
|
+
info "MAINTENANCE"
|
10
|
+
task "look for sources to purge" do
|
11
|
+
purge = []
|
12
|
+
@dbi.execute("SELECT collection, rss FROM sources").each{ |dbc,dbr|
|
13
|
+
purge << [dbc, dbr] unless (@collections[dbc] || []).include? dbr
|
14
|
+
}
|
15
|
+
|
16
|
+
purge_rss = []
|
17
|
+
purge.each { |c,r|
|
18
|
+
info "* remove #{c}:#{r}..."
|
19
|
+
@dbi.execute "DELETE FROM sources WHERE collection=? AND rss=?", c, r
|
20
|
+
purge_rss << r
|
21
|
+
}
|
22
|
+
|
23
|
+
purge_rss.delete_if { |r|
|
24
|
+
purge_this = true
|
25
|
+
|
26
|
+
@collections.each { |cfc,cfr|
|
27
|
+
if purge_this
|
28
|
+
warn "* must keep #{r} because it's still in #{cfc}" if cfr && cfr.include?(r)
|
29
|
+
purge_this = !(cfr && cfr.include?(r))
|
30
|
+
end
|
31
|
+
}
|
32
|
+
|
33
|
+
!purge_this
|
34
|
+
}
|
35
|
+
purge_rss.each { |r|
|
36
|
+
info "* purge items from feed #{r}"
|
37
|
+
@dbi.execute "DELETE FROM items WHERE rss=?", r
|
38
|
+
}
|
39
|
+
end
|
40
|
+
end
|
41
|
+
alias purge! maintenance!
|
42
|
+
end
|
data/lib/harvester/mrss.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
# Magic RSS
|
3
3
|
# Helps getting around with RSS while ignoring standards and tolerating much
|
4
|
+
# TODO: refactor out of harvester
|
4
5
|
|
5
6
|
require 'rexml/document'
|
6
7
|
require 'cgi'
|
@@ -187,6 +188,7 @@ class MRSS
|
|
187
188
|
@e.s_text "link"
|
188
189
|
end
|
189
190
|
def description
|
191
|
+
@e.rss_content("itunes:summary") ||
|
190
192
|
@e.rss_content("content:encoded") ||
|
191
193
|
@e.rss_content("encoded") ||
|
192
194
|
@e.rss_content("description")
|
@@ -291,9 +293,7 @@ class MRSS
|
|
291
293
|
k = case xml.name
|
292
294
|
when 'rss'
|
293
295
|
RSS
|
294
|
-
when
|
295
|
-
RDF
|
296
|
-
when 'RDF'
|
296
|
+
when *%w[rdf RDF]
|
297
297
|
RDF
|
298
298
|
when 'feed'
|
299
299
|
ATOM
|
@@ -303,6 +303,7 @@ class MRSS
|
|
303
303
|
k.new(xml)
|
304
304
|
end
|
305
305
|
|
306
|
+
# TODO also refactor out of mrss?
|
306
307
|
module Util
|
307
308
|
def self.detect_time(s)
|
308
309
|
tz_offset = 0
|
data/lib/harvester/post.rb
CHANGED
@@ -3,11 +3,12 @@
|
|
3
3
|
require_relative '../harvester'
|
4
4
|
|
5
5
|
class Harvester
|
6
|
-
module POST; end
|
7
6
|
|
7
|
+
POST = true
|
8
8
|
# runs the configured post processing scripts
|
9
9
|
def post!(path = nil)
|
10
|
-
|
10
|
+
info 'POST'
|
11
|
+
task 'post process' do
|
11
12
|
if post_script = path || @config['settings']['post_script']
|
12
13
|
error "Cannot find an executable script at #{ post_script }" unless test('x', post_script)
|
13
14
|
exec post_script, @config['settings']['output']
|
@@ -1,15 +1,13 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require_relative '../harvester'
|
4
|
-
require_relative '../harvester/db'
|
5
4
|
require 'gruff'
|
6
5
|
|
7
6
|
class Harvester
|
8
|
-
|
9
|
-
|
7
|
+
STATS = true
|
10
8
|
# generates a fetch statistic image
|
11
|
-
def
|
12
|
-
info "
|
9
|
+
def stats!
|
10
|
+
info "STATS"
|
13
11
|
task "generate chart" do
|
14
12
|
c = Chart::StatsPerCollection.new
|
15
13
|
@dbi.execute( File.read( sql_query(:chart) ) ).each{ |date,collection|
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: 6
|
5
|
-
version: 0.8.0.pre.2
|
5
|
+
version: 0.8.0.pre.3
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- astro
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-06-
|
17
|
+
date: 2011-06-10 00:00:00 Z
|
18
18
|
dependencies:
|
19
19
|
- !ruby/object:Gem::Dependency
|
20
20
|
name: rdbi
|
@@ -131,39 +131,42 @@ email: mail@janlelis.de
|
|
131
131
|
executables:
|
132
132
|
- harvester-new
|
133
133
|
- harvester
|
134
|
+
- harvester-create
|
134
135
|
- harvester-clock
|
135
|
-
- harvester-
|
136
|
-
- harvester-chart
|
136
|
+
- harvester-stats
|
137
137
|
- harvester-run
|
138
138
|
- harvester-generate
|
139
139
|
- harvester-fetch
|
140
140
|
- harvester-jabber
|
141
141
|
- harvester-post
|
142
|
+
- harvester-maintenance
|
142
143
|
extensions: []
|
143
144
|
|
144
145
|
extra_rdoc_files: []
|
145
146
|
|
146
147
|
files:
|
148
|
+
- lib/harvester/maintenance.rb
|
147
149
|
- lib/harvester/generate.rb
|
148
150
|
- lib/harvester/generator/entity_translator.rb
|
149
151
|
- lib/harvester/generator/link_absolutizer.rb
|
150
|
-
- lib/harvester/
|
151
|
-
- lib/harvester/chart.rb
|
152
|
+
- lib/harvester/create.rb
|
152
153
|
- lib/harvester/post.rb
|
154
|
+
- lib/harvester/stats.rb
|
153
155
|
- lib/harvester/jabber.rb
|
154
156
|
- lib/harvester/fetch.rb
|
155
157
|
- lib/harvester/mrss.rb
|
156
158
|
- lib/harvester.rb
|
157
159
|
- bin/harvester-new
|
158
160
|
- bin/harvester
|
161
|
+
- bin/harvester-create
|
159
162
|
- bin/harvester-clock
|
160
|
-
- bin/harvester-
|
161
|
-
- bin/harvester-chart
|
163
|
+
- bin/harvester-stats
|
162
164
|
- bin/harvester-run
|
163
165
|
- bin/harvester-generate
|
164
166
|
- bin/harvester-fetch
|
165
167
|
- bin/harvester-jabber
|
166
168
|
- bin/harvester-post
|
169
|
+
- bin/harvester-maintenance
|
167
170
|
- README.rdoc
|
168
171
|
- CHANGELOG.rdoc
|
169
172
|
- data/ent/HTMLsymbol.ent
|
data/bin/harvester-chart
DELETED
data/bin/harvester-db
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# encoding: utf-8
|
3
|
-
|
4
|
-
require_relative '../lib/harvester/db'
|
5
|
-
harve = Harvester.new_from_argv
|
6
|
-
|
7
|
-
case action = ARGV.shift
|
8
|
-
when 'create'
|
9
|
-
harve.create!
|
10
|
-
when 'maintenance'
|
11
|
-
harve.maintenance!
|
12
|
-
else
|
13
|
-
puts '[DB] Do nothing...'
|
14
|
-
end
|
15
|
-
|
data/lib/harvester/db.rb
DELETED
@@ -1,123 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require_relative '../harvester'
|
4
|
-
|
5
|
-
class Harvester
|
6
|
-
module DB; end
|
7
|
-
|
8
|
-
# creates required database structure
|
9
|
-
def create!
|
10
|
-
task "create database tables" do
|
11
|
-
begin @dbi.transaction do
|
12
|
-
sql_queries(:create).each{ |sql|
|
13
|
-
info "* execute " + File.basename(sql)
|
14
|
-
@dbi.execute File.read(sql)
|
15
|
-
}
|
16
|
-
end; end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
# check for feed source changes
|
21
|
-
def maintenance!
|
22
|
-
task "look for sources to purge" do
|
23
|
-
purge = []
|
24
|
-
@dbi.execute("SELECT collection, rss FROM sources").each{ |dbc,dbr|
|
25
|
-
purge << [dbc, dbr] unless (@collections[dbc] || []).include? dbr
|
26
|
-
}
|
27
|
-
|
28
|
-
purge_rss = []
|
29
|
-
purge.each { |c,r|
|
30
|
-
info "* remove #{c}:#{r}..."
|
31
|
-
@dbi.execute "DELETE FROM sources WHERE collection=? AND rss=?", c, r
|
32
|
-
purge_rss << r
|
33
|
-
}
|
34
|
-
|
35
|
-
purge_rss.delete_if { |r|
|
36
|
-
purge_this = true
|
37
|
-
|
38
|
-
@collections.each { |cfc,cfr|
|
39
|
-
if purge_this
|
40
|
-
warn "* must keep #{r} because it's still in #{cfc}" if cfr && cfr.include?(r)
|
41
|
-
purge_this = !(cfr && cfr.include?(r))
|
42
|
-
end
|
43
|
-
}
|
44
|
-
|
45
|
-
!purge_this
|
46
|
-
}
|
47
|
-
purge_rss.each { |r|
|
48
|
-
info "* purge items from feed #{r}"
|
49
|
-
@dbi.execute "DELETE FROM items WHERE rss=?", r
|
50
|
-
}
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
private
|
55
|
-
|
56
|
-
def update(rss_url, new_source, collection, response, rss_url_nice = rss_url)
|
57
|
-
rss = MRSS.parse(response)
|
58
|
-
|
59
|
-
begin @dbi.transaction do
|
60
|
-
# update source
|
61
|
-
if new_source
|
62
|
-
@dbi.execute "INSERT INTO sources (collection, rss, last, title, link, description) VALUES (?, ?, ?, ?, ?, ?)",
|
63
|
-
collection, rss_url, response['Last-Modified'], rss.title, rss.link, rss.description
|
64
|
-
info rss_url_nice + "Added as source"
|
65
|
-
else
|
66
|
-
@dbi.execute "UPDATE sources SET last=?, title=?, link=?, description=? WHERE collection=? AND rss=?",
|
67
|
-
response['Last-Modified'], rss.title, rss.link, rss.description, collection, rss_url
|
68
|
-
debug rss_url_nice + "Source updated"
|
69
|
-
end
|
70
|
-
|
71
|
-
# update items
|
72
|
-
items_new, items_updated = 0, 0
|
73
|
-
rss.items.each { |item|
|
74
|
-
description = item.description
|
75
|
-
|
76
|
-
# Link mangling
|
77
|
-
begin
|
78
|
-
link = URI::join((rss.link.to_s == '') ? uri.to_s : rss.link.to_s, item.link || rss.link).to_s
|
79
|
-
rescue URI::Error
|
80
|
-
link = item.link
|
81
|
-
end
|
82
|
-
|
83
|
-
# Push into database
|
84
|
-
db_title, = *@dbi.execute("SELECT title FROM items WHERE rss=? AND link=?", rss_url, link).fetch
|
85
|
-
|
86
|
-
if db_title.nil? || db_title.empty? # item is new
|
87
|
-
begin
|
88
|
-
@dbi.execute "INSERT INTO items (rss, title, link, date, description) VALUES (?, ?, ?, ?, ?)",
|
89
|
-
rss_url, item.title, link, item.date.to_s, description
|
90
|
-
items_new += 1
|
91
|
-
#rescue DBI::ProgrammingError
|
92
|
-
# puts description
|
93
|
-
# puts "#{$!.class}: #{$!}\n#{$!.backtrace.join("\n")}"
|
94
|
-
end
|
95
|
-
else
|
96
|
-
@dbi.execute "UPDATE items SET title=?, description=?, date=? WHERE rss=? AND link=?",
|
97
|
-
item.title, description, item.date.to_s, rss_url, link
|
98
|
-
items_updated += 1
|
99
|
-
end
|
100
|
-
|
101
|
-
# Remove all enclosures
|
102
|
-
@dbi.execute "DELETE FROM enclosures WHERE rss=? AND link=?", rss_url, link
|
103
|
-
|
104
|
-
# Re-add all enclosures
|
105
|
-
item.enclosures.each do |enclosure|
|
106
|
-
href = URI::join((rss.link.to_s == '') ? link.to_s : rss.link.to_s, enclosure['href']).to_s
|
107
|
-
@dbi.execute "INSERT INTO enclosures (rss, link, href, mime, title, length) VALUES (?, ?, ?, ?, ?, ?)",
|
108
|
-
rss_url, link, href, enclosure['type'], enclosure['title'],
|
109
|
-
!enclosure['length'] || enclosure['length'].empty? ? 0 : enclosure['length']
|
110
|
-
end
|
111
|
-
}
|
112
|
-
info rss_url_nice + "#{ items_new } new items, #{ items_updated } updated"
|
113
|
-
end; end
|
114
|
-
end
|
115
|
-
|
116
|
-
def sql_queries(task)
|
117
|
-
Dir[ File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }*.sql" ].each
|
118
|
-
end
|
119
|
-
|
120
|
-
def sql_query(task)
|
121
|
-
File.dirname(__FILE__) + "/../../data/sql/#{ @config['db']['driver'].downcase }/#{ task }.sql"
|
122
|
-
end
|
123
|
-
end
|