abelard 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/abelard +12 -0
- data/lib/abelard/archive.rb +17 -0
- data/lib/abelard/dir.rb +166 -0
- data/lib/abelard/dump.rb +6 -0
- data/lib/abelard/history.rb +142 -0
- data/lib/abelard/list.rb +31 -0
- data/lib/abelard/load.rb +360 -0
- data/lib/abelard/postxml.rb +80 -0
- data/lib/abelard/web.rb +35 -0
- metadata +67 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 11d17040bfbcb446c38d93ce3ddaf614a55fb7c1
|
4
|
+
data.tar.gz: b40187d7a1c261912e3e2655e7783a6e4b9915c7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cb41b5744e105884515e1e8e8513bc6601c132a4b332e7981f6fe038df48b7ff7cb797bac6f3ac6c9a5d20d62656c1c46d7f50c361dce387b341591265086edc
|
7
|
+
data.tar.gz: be36ec1cb0beb93e410226221bbe4faacb95aed8f156aec17b2be2b8c6f4803939caa939ef01800463f1119da6c296ceafc5bedc17cbd7165e9c6074375b37e0
|
data/bin/abelard
ADDED
data/lib/abelard/dir.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
require 'time'
|
3
|
+
require 'abelard/history'
|
4
|
+
|
5
|
+
# known namespaces for xpath search, as "prefix:uri" strings in the form
# LibXML's find/find_first expects. Frozen: shared module-level constant.
NS = [
  "atom:http://www.w3.org/2005/Atom",
  "dc:http://purl.org/dc/elements/1.1/",
  "app:http://purl.org/atom/app#",
  "wp:http://wordpress.org/export/1.2/"
].freeze
|
12
|
+
|
13
|
+
# A single feed entry (Atom <entry> or RSS <item>) wrapped in its own XML
# document, with timestamp/title/author/status extracted for convenience.
class Item
  attr_accessor :timestamp, :title, :file, :doc, :author, :status

  # xml      - LibXML document whose root is one entry/item
  # filename - path this entry will be written to by #save
  def initialize(xml, filename)
    @doc = xml
    @file = filename

    published = doc.find_first("/atom:entry/atom:published", NS) ||
                doc.find_first("/item/pubDate")
    # Time.new(0) is a sentinel for undated entries so sorting still works.
    @timestamp = published ? Time.parse(published.content) : Time.new(0)

    heading = doc.find_first("/atom:entry/atom:title", NS) ||
              doc.find_first("/item/title")
    @title = heading ? heading.content : "Post"

    byline = doc.find_first("/atom:entry/atom:author/atom:name", NS) ||
             doc.find_first("/item/dc:creator", NS)
    @author = byline ? byline.content : 'abelard'

    @status = :published

    # WordPress exports carry an explicit per-item status element.
    wp_status = doc.find_first("/item/wp:status", NS)
    if wp_status
      $stderr.puts("raw status #{wp_status.content}")
      case wp_status.content
      when "trash" then @status = :trash
      when "draft" then @status = :draft
      end
    end

    # Atom feeds flag drafts via app:control/app:draft = "yes".
    draft = doc.find_first("/atom:entry/app:control/app:draft", NS)
    @status = :draft if draft && draft.content == "yes"
  end

  # Write the entry's XML document to its file.
  def save
    puts("writing #{file}")
    doc.save(file, :indent => true, :encoding => LibXML::XML::Encoding::UTF_8)
  end
end
|
66
|
+
|
67
|
+
# An on-disk feed archive: feed.xml plus post-*.xml entry files (and
# channel-1.xml for RSS archives), version-controlled through History.
class Directory
  def initialize(path)
    @path = path
    @base_doc = read_base_doc
    # The root element name identifies which feed dialect the archive holds.
    @feed_type = case @base_doc.root.name
                 when "feed" then :atom
                 when "rss"  then :rss
                 else :unknown
                 end

    @git = History.new(self, path)
  end

  # Commit any new or changed entry files to the git history.
  def save
    @git.commit_posts
  end

  # Parse the archive's base document. For RSS archives the channel
  # metadata lives in channel-1.xml rather than feed.xml.
  def read_base_doc
    feed = LibXML::XML::Parser.file("#{@path}/feed.xml").parse
    if feed.root.name == "rss"
      LibXML::XML::Parser.file("#{@path}/channel-1.xml").parse
    else
      feed
    end
  end

  # Lazily (re)load the base document.
  def base_doc
    @base_doc ||= read_base_doc
  end

  # Iterates the Item objects for the feed, in timestamp order.
  # FIX: the previous implementation keyed a hash by timestamp, so two
  # posts sharing a timestamp silently dropped one; now all are yielded.
  def each
    items = []
    each_unsorted { |post, filename| items << Item.new(post, filename) }
    items.sort_by!(&:timestamp)
    items.each { |item| yield item }
  end

  # Summary metadata for the archive; currently just the feed title.
  def info
    inf = {}
    el = base_doc.find_first("/atom:feed/atom:title", NS) ||
         base_doc.find_first("/rss/channel/title")
    inf["title"] = el.content if el # tolerate archives without a title
    inf
  end

  # A freshly-built document containing the feed header plus every post.
  # Returns nil for an unknown feed type.
  def posts_feed
    feed = read_base_doc
    case @feed_type
    when :atom
      posts_feed_atom(feed)
    when :rss
      posts_feed_rss(feed)
    end
  end

  # Import every post's root element into +collection+ (a LibXML node).
  def insert_posts(collection)
    each do |post|
      $stderr.puts "adding #{post.file}"
      collection << collection.doc.import(post.doc.root)
    end
    collection
  end

  # Yields each (parsed document, filename) pair in directory-glob order.
  def each_unsorted
    Dir.glob("#{@path}/post-*.xml") do |filename|
      post = LibXML::XML::Parser.file(filename).parse
      yield post, filename
    end
  end

  def posts_feed_atom(doc)
    insert_posts(doc.root)
    doc
  end

  def posts_feed_rss(rssdoc)
    doc = LibXML::XML::Parser.file("#{@path}/channel-1.xml").parse
    channel = doc.find_first("/rss/channel")
    insert_posts(channel)
    doc
  end

  # Order git Entry objects by the timestamp of the item each file holds.
  def sort_entries(repo_entries)
    repo_entries.sort_by do |e|
      Item.new(LibXML::XML::Parser.file(e.path).parse, e.path).timestamp
    end
  end
end
|
data/lib/abelard/dump.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
# Manage a git repository representing the feed
|
2
|
+
|
3
|
+
require 'rugged'
|
4
|
+
require 'pathname'
|
5
|
+
|
6
|
+
# Manage a git repository representing the feed: header files get "feed info"
# commits, each post/comment file gets its own commit attributed to the
# post's author and timestamp.
class History
  # archive is a Directory, dir is a path to store in git
  def initialize(archive, dir)
    @archive = archive
    if File.directory? dir
      begin
        # Reuse an enclosing repository when dir already sits inside one.
        repo = Rugged::Repository.discover(dir)

        repo_base = Pathname.new(repo.workdir).realpath.to_s
        real_dir = Pathname.new(dir).realpath.to_s
        raise "confused! #{repo_base} #{real_dir}" unless real_dir.start_with?(repo_base)
        # Feed directory path relative to the git work tree root
        # ("" when they coincide).
        @relative_root = real_dir[repo_base.length+1..-1] || ""
        $stderr.puts "#{real_dir} in #{repo_base} : #{@relative_root}"

        # NOTE(review): the return value is discarded, so a dirty work tree
        # does not actually block anything here — confirm that is intended.
        check_repo_clean(repo, @relative_root)
      rescue Rugged::RepositoryError
        # No repository found at or above dir: start a fresh one in place.
        repo = Rugged::Repository.init_at(dir)
        @relative_root = ""
      end
    elsif File.exist? dir
      fail "#{dir} exists as file"
    else
      Dir.mkdir(dir)
      repo = Rugged::Repository.init_at(dir)
      @relative_root = ""
    end
    @repo = repo
    @dir_path = dir
  end

  # One tracked file, viewed under the three path conventions used here.
  class Entry
    # dir_fn is the path to the file relative to the feed directory
    # git_fn is the path to the file relative to the git root
    # path is the full path

    attr_reader :git_fn, :dir_fn, :path
    def initialize(f, root, repository)
      @git_fn = f

      @dir_fn = if root.empty?
                  f
                else
                  f[root.length+1..-1]
                end

      @path = repository.workdir + '/' + @dir_fn
    end
  end

  # Wrap a git-relative path in an Entry for this repository.
  def entry(from_git)
    Entry.new(from_git, @relative_root, @repo)
  end

  # True when no post/comment file under +sub+ has uncommitted changes.
  def check_repo_clean(repo, sub)
    $stderr.puts "check_repo_clean(#{repo},#{sub})"
    clean = true
    repo.status do |file, data|
      change = classify_file(sub, file)
      clean = false if change == :real
    end
    clean
  end

  # Commit every changed file: feed/channel headers first (one "feed info"
  # commit each), then each post/comment as its own commit, ordered by post
  # timestamp so git history mirrors publication order.
  def commit_posts
    repo = @repo
    sub = @relative_root

    commits = 0
    todo = { real: [] }
    @repo.status do |file, data|
      change = classify_file(sub, file)
      todo[change] ||= []
      todo[change] << file
    end
    if todo[:top]
      todo[:top].each do |file|
        repo.index.add file
        repo_entry = entry(file)
        item = Item.new(LibXML::XML::Parser.file(repo_entry.path).parse, repo_entry.path)

        # Synthesized identity: feed items carry a name but no email.
        author = {:email => "#{item.author}@example.org",
                  :time => item.timestamp,
                  :name => item.author}
        parents = []
        parents << repo.head.target unless repo.head_unborn?
        commit = Rugged::Commit.create(repo,
                                       :author => author,
                                       :message => "feed info",
                                       :committer => author,
                                       :parents => parents,
                                       :tree => repo.index.write_tree(repo),
                                       :update_ref => "HEAD")
        commits = commits+1
      end
    end

    # Oldest post first (sorted by the Directory via each file's timestamp).
    to_commit = @archive.sort_entries(todo[:real].map { |f| entry(f) })

    to_commit.each do |entry|
      file = entry.git_fn
      repo.index.add file
      repo_entry = entry(file)
      item = Item.new(LibXML::XML::Parser.file(repo_entry.path).parse, repo_entry.path)

      author = {:email => "#{item.author}@example.org",
                :time => item.timestamp,
                :name => item.author}

      $stderr.puts "Adding #{file}"

      # NOTE(review): assumes HEAD exists by this point (either a :top
      # commit above or a prior run); would fail on a brand-new repo that
      # has posts but no feed/channel header changes — confirm.
      commit = Rugged::Commit.create(repo,
                                     :author => author,
                                     :message => "post",
                                     :committer => author,
                                     :parents => [repo.head.target],
                                     :tree => repo.index.write_tree(repo),
                                     :update_ref => "HEAD")
      commits = commits+1
    end

    repo.index.write if commits > 0
  end

  # Sort a status-reported file into :real (post/comment content),
  # :top (feed or channel header), :outside, or :unknown.
  def classify_file(subdir, file)
    # normally 1 archive = 1 repo, but if you have a repo of several
    # archives, ignore file changes outside
    return :outside unless file.start_with?(subdir)

    filename = Pathname.new(file).basename.to_s

    return :real if filename.start_with?("post-") or filename.start_with?("comment-")
    return :top if filename.start_with?("feed") or filename.start_with?("channel")

    return :unknown
  end
end
|
142
|
+
|
data/lib/abelard/list.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'abelard/dir.rb'
|
5
|
+
CONFIG_FILE = "blogfeeds.yaml"
|
6
|
+
|
7
|
+
# Entry point for "abelard list": with no arguments, list the configured
# feeds; with -d <dir> or a config key, list that archive's entries by date.
if ARGV.empty?
  configs = YAML.load_file(CONFIG_FILE)
  configs.each do |name, conf|
    puts("#{name}: #{conf['urls'].first}")
  end
elsif ARGV[0] == '-h'
  $stderr.puts("abelard list -d <dir>\nabelard list <config-entry>\nabelard list\n")
else
  dest =
    if ARGV[0] == '-d'
      ARGV[1]
    else
      YAML.load_file(CONFIG_FILE)[ARGV[0]]['dest']
    end
  Directory.new(dest).each do |item|
    printf("%s %s\n", item.timestamp.strftime("%Y-%m-%d"), item.title)
  end
end
|
31
|
+
|
data/lib/abelard/load.rb
ADDED
@@ -0,0 +1,360 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'libxml'
|
3
|
+
require 'net/http'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
require 'abelard/dir.rb'
|
7
|
+
require 'abelard/postxml.rb'
|
8
|
+
|
9
|
+
CONFIG_FILE = "blogfeeds.yaml"
|
10
|
+
|
11
|
+
Debug = true
|
12
|
+
|
13
|
+
|
14
|
+
# Helpers shared by the RSS and Atom splitters for writing out
# single-entry documents.
module Splitter
  # Deep-copy +xmlnode+ into its own document and wrap it as an Item.
  def item(xmlnode, filename)
    single = LibXML::XML::Document.new()
    single.root = xmlnode.copy(true)
    Item.new(single, filename)
  end

  # deprecated — writes a node straight to disk without Item's status checks
  def write_item(xmlnode, file)
    single = LibXML::XML::Document.new()
    single.root = xmlnode.copy(true)
    puts("writing #{file}")
    single.save(file, :indent => true, :encoding => LibXML::XML::Encoding::UTF_8)
  end

  # stream the document to a string and reparse it to clean up redundant namespaces
  def write_doc_clean(doc, file)
    reparsed = LibXML::XML::Parser.string(doc.to_s, :options => LibXML::XML::Parser::Options::NSCLEAN).parse
    reparsed.save(file)
  end
end
|
35
|
+
|
36
|
+
# Splits a WordPress/RSS export document into a feed.xml skeleton,
# per-channel header files, and one file per post/comment.
class Splitter_rss
  include Splitter
  NS = ['wp:http://wordpress.org/export/1.2/'];

  def initialize(document, destination)
    @doc = document
    @dest = destination
  end

  # In a wordpress export, the comments are in wp:comment elements nested
  # inside the post item. Detach them and return the approved ones as Item
  # objects ready to save; sensitive commenter fields are stripped.
  def extract_comments(item)
    basename = Post_id_rss.new(item)

    all = []
    comment_nodes = item.find("wp:comment", NS)
    comment_nodes.each do |node|
      comment_doc = LibXML::XML::Document.new()
      comment_doc.root = node.remove!
      approved = comment_doc.find_first("/wp:comment/wp:comment_approved", NS)
      author_email = comment_doc.find_first("/wp:comment/wp:comment_author_email", NS)
      author_ip = comment_doc.find_first("/wp:comment/wp:comment_author_IP", NS)
      id = comment_doc.find_first("/wp:comment/wp:comment_id", NS)

      # delete some sensitive fields
      author_email.remove! if (author_email)
      author_ip.remove! if (author_ip)

      if (approved && (approved.content == '1'))
        # NOTE(review): assumes wp:comment_id is always present — confirm.
        filename = basename.as_comment(id.content)

        # FIX: restored the #{filename} interpolation garbled in the source.
        all << Item.new(comment_doc, "#{@dest}/#{filename}")
      end
    end
    all
  end

  # Walk the <rss> root: copy non-channel children into the feed skeleton,
  # write each channel's header to channel-N.xml (skipping comment feeds),
  # and save each contained item as its own file.
  def split_items
    channel_count = 1
    rss = @doc.root
    @parent = LibXML::XML::Document.new()
    root = LibXML::XML::Node.new(rss.name)
    @parent.root = root
    rss.attributes.each { |a| root.attributes[a.name] = a.value }
    rss.children.select(&:element?).each do |channel|
      if (channel.name == "channel")
        root << channel.clone # shallow copy for feed.xml

        channelself = XmlUtil::self_link(channel)
        is_comments = (channelself =~ /comments/)

        copy = LibXML::XML::Node.new(channel.name)
        channel.attributes.each { |a| copy.attributes[a.name] = a.value }
        channel.children.select(&:element?).each do |node|
          $stderr.puts(node.name)
          if (node.name == "item")
            # attachments dont get saved as posts
            if (node.find("wp:attachment_url", "wp:http://wordpress.org/export/1.2/").length > 0)
              $stderr.puts("skipping attachment")
            else
              # in a wordpress export file, comments are included inside the post item
              comments = extract_comments(node)
              save(node)
              comments.each { |c| c.save }
            end
          else
            copy << node.copy(true)
          end
        end
        ch_copy = root.copy(true)
        ch_copy << copy
        unless is_comments
          channel_doc = LibXML::XML::Document.new()
          channel_doc.root = ch_copy
          channel_doc.save("#{@dest}/channel-#{channel_count}.xml")
        end
        channel_count = channel_count + 1
      else
        root << channel
      end
    end
    @parent.save("#{@dest}/feed.xml")
  end

  # Save one post item under a filename derived from its guid, unless the
  # item is a draft or trashed.
  def save(node)
    filename = Post_id_rss.new(node).to_s
    # FIX: restored the #{filename} interpolations garbled in the source.
    new_item = item(node, "#{@dest}/#{filename}")
    if new_item.status == :published
      new_item.save
    else
      $stderr.puts("skipping #{filename} as status #{new_item.status}")
    end
  end
end
|
129
|
+
|
130
|
+
# Splits an Atom feed document into per-entry files plus a feed.xml
# header skeleton.
class Splitter_atom
  include Splitter

  def initialize(document, destination)
    @doc = document
    @dest = destination
  end

  def split_items
    feed = @doc.root

    feedself = XmlUtil::self_link(feed)

    # Blogger serves posts and comments as distinct feeds; the self link
    # identifies which this is. Otherwise entry categories decide later.
    @feed_type =
      if feedself =~ %r{/posts/default$}
        "post"
      elsif feedself =~ %r{/comments/default$}
        "comment"
      end

    @parent = LibXML::XML::Document.new()
    root = LibXML::XML::Node.new(feed.name)
    @parent.root = root
    feed.namespaces.definitions.each { |ns| LibXML::XML::Namespace.new(root, ns.prefix, ns.href) }
    feed.attributes.each { |a| root.attributes[a.name] = a.value }

    feed.children.select(&:element?).each do |node|
      if node.name == "entry"
        save(node)
      else
        root << @parent.import(node)
      end
    end

    write_doc_clean(@parent, "#{@dest}/feed.xml")
  end

  # Write one entry to post-N.xml or comment-P-N.xml, with the numbers
  # taken from the entry's self link path.
  def save(node)
    path = XmlUtil::self_link(node)

    return unless node.name == "entry"

    category = XmlUtil::child_attribute(node, "category", "term")

    entry_type = @feed_type
    entry_type = category.split('#').last if entry_type.nil? && category

    case entry_type
    when "post"
      postnumber = path.split('/').last
      write_item(node, "#{@dest}/post-#{postnumber}.xml")
    when "comment"
      pathsplit = path.split('/')
      postnumber = pathsplit[-4]
      commentnumber = pathsplit[-1]
      write_item(node, "#{@dest}/comment-#{postnumber}-#{commentnumber}.xml")
    end
  end
end
|
194
|
+
|
195
|
+
# Shared feed-ingestion logic: parse a document, split it into the archive
# layout, then commit the result to git.
class SourceBase
  def process(parser, destination)
    doc = parser.parse

    splitter =
      case doc.root.name
      when "feed" then Splitter_atom.new(doc, destination)
      when "rss"  then Splitter_rss.new(doc, destination)
      end

    if splitter
      splitter.split_items
    else
      puts "don't know what to do with element #{doc.root.name}"
    end

    # Commit whatever the split produced (runs even for unknown roots,
    # matching the original flow).
    Directory.new(destination).save
  end
end
|
214
|
+
|
215
|
+
# Loads a feed from a local export file instead of over HTTP.
class FileSource < SourceBase
  def initialize(filename, dest)
    @file = filename
    @dest = dest
  end

  # Parse the file and populate the destination archive.
  def load
    ensure_dest(@dest)
    process(LibXML::XML::Parser.file(@file), @dest)
  end
end
|
227
|
+
|
228
|
+
# Loads one or more feed URLs into an archive directory, driven either by
# an inline config hash or a named entry in the blogfeeds.yaml config file.
class Source < SourceBase
  # conf_in - a config hash (responds to :keys) or a config-file key.
  def initialize(conf_in)
    if conf_in.respond_to?(:keys)
      @conf = conf_in
    else
      @conf = get_config(conf_in) || die("No config for #{conf_in}")
    end
  end

  # Fetch every configured URL and split it into the destination archive.
  def load
    conf = @conf
    dest = conf["dest"] || die("No 'dest' directory defined")
    urls = conf["urls"] || die("No urls defined")

    ensure_dest(dest)

    fetcher = Fetcher.new
    fetcher.user = conf["user"]
    fetcher.password = conf["password"]

    urls.each do |urlpath|
      url = URI(urlpath)
      fetcher.get(url) do |response|
        write_raw(response.body, "#{dest}/raw.xml") if Debug
        parser = LibXML::XML::Parser.string(response.body)
        process(parser, dest)
      end
    end
  end

  # The whole config file as a hash ({} when the file is empty).
  def all_configs
    YAML.load_file(CONFIG_FILE) || {}
  end

  def get_config(key)
    configuration_file = all_configs
    configuration_file[key]
  end

  # Persist a new config entry; refuses to overwrite an existing key.
  def save_config(key, conf)
    configuration_data = all_configs
    if configuration_data.key? key
      die("Already have config for #{key}")
    else
      configuration_data[key] = conf
      # File.open rather than Kernel#open: Kernel#open would execute a
      # subprocess if the path ever started with "|".
      File.open(CONFIG_FILE, 'w+') do |conf_file|
        conf_file.puts(configuration_data.to_yaml)
      end
    end
  end

  # Debug helper: dump the raw HTTP body beside the split files.
  def write_raw(data, filename)
    File.open(filename, "w") { |f| f.write(data) }
  end
end
|
283
|
+
|
284
|
+
# Report a fatal error on stdout and terminate with a failure status.
def die(error)
  puts(error)
  exit(1)
end
|
288
|
+
|
289
|
+
# Wrap an HTTP session, handle making a request and
|
290
|
+
# following any redirects
|
291
|
+
# Wrap an HTTP session, handle making a request and
# following any redirects. The session is reused while consecutive
# requests target the same host.
class Fetcher
  def initialize
    @host = nil
    @session = nil
    @user = nil
    @password = nil
  end
  attr_accessor :user, :password

  # GET +url+ and yield the successful response; follows up to +max_depth+
  # redirects. Errors and redirect loops are reported, not raised.
  def get(url, max_depth=3)
    if (url.host != @host)
      @host = url.host
      @session = Net::HTTP.new(@host, url.port)
      @session.use_ssl = (url.scheme == 'https')
    end
    request = Net::HTTP::Get.new(url)
    request.basic_auth(user, password) if user

    msg = "Reading (#{url.to_s})"
    msg << " as #{user}" if user
    $stderr.puts(msg)

    feedxml = @session.request(request)
    if (feedxml.is_a? Net::HTTPOK)
      yield feedxml
    elsif (feedxml.is_a? Net::HTTPRedirection)
      # FIX: previously only 301 (HTTPMovedPermanently) was followed;
      # HTTPRedirection also covers 302/303/307/308.
      new_url = feedxml['Location']
      if (new_url == url.to_s)
        puts("Confused! redirect to same url #{new_url}")
      elsif (max_depth == 0)
        puts("Too many redirects")
      else
        puts("Redirecting to #{new_url}")
        get(URI(new_url), max_depth-1) { |r| yield r }
      end
    else
      puts("GET returned #{feedxml.code}")
      puts(feedxml.body)
    end
  end
end
|
334
|
+
|
335
|
+
# Make sure +dest+ exists as a directory, creating it when missing.
# Only warns (does not abort) when the directory still isn't there.
def ensure_dest(dest)
  Dir.mkdir(dest) unless File.directory?(dest)
  $stderr.puts "Could not create directory #{dest}" unless File.directory?(dest)
end
|
341
|
+
|
342
|
+
# Command-line driver: build a Source from the arguments, then load it.
case ARGV[0]
when nil
  $stderr.puts "Syntax:\nabelard load -f <filename> <destination>\nabelard load {-n <url>} ... <destination>\nabelard load <config>"
  exit(1)
when "-f"
  # Load from a local export file.
  source = FileSource.new(ARGV[1], ARGV[2])
when '-n'
  # One or more "-n <url>" pairs followed by the destination directory;
  # the assembled configuration is remembered in the config file.
  urls = []
  while ARGV[0] == '-n'
    ARGV.shift
    urls << ARGV.shift
  end
  conf = { "urls" => urls, "dest" => ARGV[0] }
  source = Source.new(conf)
  source.save_config(conf["dest"], conf)
else
  # A key naming an existing entry in the config file.
  source = Source.new(ARGV[0])
end
source.load
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
# Helpers for picking data out of XML nodes by walking children directly
# (no namespace-aware xpath needed).
module XmlUtil
  # Content of the first child element named +elementname+, or nil.
  def self.child_content(node, elementname)
    el = node.children.find { |n| n.name == elementname }
    el && el.content
  end

  # Value of +attributename+ on the first child element named
  # +elementname+, or nil when either is missing.
  def self.child_attribute(node, elementname, attributename)
    el = node.children.find { |n| n.name == elementname }
    # FIX: previously hard-coded get_attribute("term"), ignoring the
    # attributename parameter.
    attr = el && el.attributes.get_attribute(attributename)
    attr && attr.value
  end

  # True when +node+ carries the given attribute with exactly that value.
  def self.with_attribute(node, attributename, attributevalue)
    a = node.attributes.get_attribute(attributename)
    a && (a.value == attributevalue)
  end

  # href of the child <link rel="self"> element, or nil when absent.
  def self.self_link(node)
    el = node.children.find { |l| (l.name == "link") && with_attribute(l, "rel", "self") }
    el && el.attributes.get_attribute("href").value
  end
end
|
22
|
+
|
23
|
+
|
24
|
+
# Derives a stable archive filename ("post-*.xml" / "comment-*.xml")
# from an RSS item's guid URL.
class Post_id_rss
  attr_reader :idurl
  def initialize(postxml)
    #XmlUtil::child_content(postxml, "post_id") ||
    @idurl = XmlUtil::child_content(postxml, "guid")
    @raw = postxml.to_s
  end

  # Filename for this item: comment-<post>-<n>.xml when the guid looks
  # like a comment permalink, otherwise post-<id>.xml.
  def to_s
    return improvise unless idurl

    commenturl = /\?p(age_id)?=(\d+)(\.xml)?#comment-(.*)$/.match(idurl) ||
                 /^(.*)\/(\d{4}\/.*)\/#(comment)-(.*)$/.match(idurl)

    if commenturl
      postnumber = commenturl[2].sub(/^\//, '').sub(/\.xml$/, '').gsub('/', '-')
      commentnumber = commenturl[4]
      "comment-#{postnumber}-#{commentnumber}.xml"
    else
      "post-#{post_match}.xml"
    end
  end

  # The numeric post id from "?p=<n>" style guids, else a sanitized path.
  def post_match
    posturl = /\?p(age_id)?=(\d+)(\.xml)?$/.match(idurl)
    posturl ? posturl[2] : sanitize
  end

  # Filename for a comment belonging to this post.
  def as_comment(commentnumber)
    "comment-#{post_match}-#{commentnumber}.xml"
  end

  # Turn a guid URL into a filesystem-safe identifier.
  def sanitize
    uri = URI(idurl)
    $stderr.puts("Could not parse url #{idurl}") unless uri
    # tag: URIs (e.g. Blogger) end in "...post-<n>"; keep just the number.
    return idurl.split('-').last if uri.scheme == "tag"

    build = uri.path.sub(/^\//, '').sub(/\.xml$/, '').gsub('/', '-')
    build.concat('-' + uri.query.gsub(/[?&]/, '-')) if uri.query
    build.concat('-' + uri.fragment) if uri.fragment
    build
  end

  # No guid at all: derive a name from the raw XML content.
  # FIX: previously used String#hash, which is randomly seeded per process,
  # so the same post got a different filename on every run; a content
  # digest keeps filenames stable across runs.
  def improvise
    require 'digest'
    "post-#{Digest::SHA1.hexdigest(@raw)[0, 16]}.xml"
  end
end
|
80
|
+
|
data/lib/abelard/web.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'sinatra/base'
|
3
|
+
require 'yaml'
|
4
|
+
require 'abelard/dir.rb'
|
5
|
+
require 'abelard/archive.rb'
|
6
|
+
|
7
|
+
CONFIG_FILE="blogfeeds.yaml"
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
# Minimal web UI: an index of archived feeds, plus each archive's posts
# served as an Atom document.
class FeedServer < Sinatra::Base
  archive = Archive.new(CONFIG_FILE)
  set :bind, "0.0.0.0"

  get '/' do
    page = <<ERB
<html><body><h2>Feeds</h2><dl>
<% archive.available.each do |blog| %>
<dt><%= blog %></dt>
<dd><%= archive.dir(blog).info["title"] %></dd>
<dd><a href="/<%= blog %>/posts">posts</a></dd>
<% end %>
</dl></body></html>
ERB
    erb page, :locals => { :archive => archive }
  end

  get '/*/posts' do |blog|
    headers "Content-Type" => "application/atom+xml"
    archive.dir(blog).posts_feed.to_s
  end

  run!
end
|
35
|
+
|
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: abelard
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Anomaly UK
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-08-26 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rugged
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.23'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.23'
|
27
|
+
description: Persist blogs and similar web content as sharable git repositories
|
28
|
+
email: anomalyuk@tesco.net
|
29
|
+
executables:
|
30
|
+
- abelard
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- bin/abelard
|
35
|
+
- lib/abelard/archive.rb
|
36
|
+
- lib/abelard/dir.rb
|
37
|
+
- lib/abelard/dump.rb
|
38
|
+
- lib/abelard/history.rb
|
39
|
+
- lib/abelard/list.rb
|
40
|
+
- lib/abelard/load.rb
|
41
|
+
- lib/abelard/postxml.rb
|
42
|
+
- lib/abelard/web.rb
|
43
|
+
homepage: http://anomalyuk.blogspot.com/
|
44
|
+
licenses:
|
45
|
+
- GPL-2.0
|
46
|
+
metadata: {}
|
47
|
+
post_install_message:
|
48
|
+
rdoc_options: []
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
requirements: []
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 2.5.2
|
64
|
+
signing_key:
|
65
|
+
specification_version: 4
|
66
|
+
summary: Abelard blog archiver
|
67
|
+
test_files: []
|