abelard 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/abelard +12 -0
- data/lib/abelard/archive.rb +17 -0
- data/lib/abelard/dir.rb +166 -0
- data/lib/abelard/dump.rb +6 -0
- data/lib/abelard/history.rb +142 -0
- data/lib/abelard/list.rb +31 -0
- data/lib/abelard/load.rb +360 -0
- data/lib/abelard/postxml.rb +80 -0
- data/lib/abelard/web.rb +35 -0
- metadata +67 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 11d17040bfbcb446c38d93ce3ddaf614a55fb7c1
|
4
|
+
data.tar.gz: b40187d7a1c261912e3e2655e7783a6e4b9915c7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cb41b5744e105884515e1e8e8513bc6601c132a4b332e7981f6fe038df48b7ff7cb797bac6f3ac6c9a5d20d62656c1c46d7f50c361dce387b341591265086edc
|
7
|
+
data.tar.gz: be36ec1cb0beb93e410226221bbe4faacb95aed8f156aec17b2be2b8c6f4803939caa939ef01800463f1119da6c296ceafc5bedc17cbd7165e9c6074375b37e0
|
data/bin/abelard
ADDED
data/lib/abelard/dir.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
require 'time'
|
3
|
+
require 'abelard/history'
|
4
|
+
|
5
|
+
# known namespaces for xpath search
|
6
|
+
NS = [
|
7
|
+
"atom:http://www.w3.org/2005/Atom",
|
8
|
+
"dc:http://purl.org/dc/elements/1.1/",
|
9
|
+
"app:http://purl.org/atom/app#",
|
10
|
+
"wp:http://wordpress.org/export/1.2/"
|
11
|
+
]
|
12
|
+
|
13
|
+
class Item
|
14
|
+
attr_accessor :timestamp, :title, :file, :doc, :author, :status
|
15
|
+
def initialize(xml, filename)
|
16
|
+
@doc = xml
|
17
|
+
@file = filename
|
18
|
+
timestamp_node = doc.find_first("/atom:entry/atom:published", NS) ||
|
19
|
+
doc.find_first("/item/pubDate")
|
20
|
+
if timestamp_node
|
21
|
+
@timestamp = Time.parse(timestamp_node.content)
|
22
|
+
else
|
23
|
+
@timestamp = Time.new(0)
|
24
|
+
end
|
25
|
+
|
26
|
+
title_node = doc.find_first("/atom:entry/atom:title", NS) ||
|
27
|
+
doc.find_first("/item/title")
|
28
|
+
if title_node
|
29
|
+
@title = title_node.content
|
30
|
+
else
|
31
|
+
@title = "Post"
|
32
|
+
end
|
33
|
+
|
34
|
+
author_node = doc.find_first("/atom:entry/atom:author/atom:name", NS) ||
|
35
|
+
doc.find_first("/item/dc:creator", NS)
|
36
|
+
if author_node
|
37
|
+
@author = author_node.content
|
38
|
+
else
|
39
|
+
@author = 'abelard'
|
40
|
+
end
|
41
|
+
|
42
|
+
@status = :published
|
43
|
+
status_node = doc.find_first("/item/wp:status", NS)
|
44
|
+
if status_node
|
45
|
+
$stderr.puts("raw status #{status_node.content}")
|
46
|
+
if status_node.content == "trash"
|
47
|
+
@status = :trash
|
48
|
+
elsif status_node.content == "draft"
|
49
|
+
@status = :draft
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
draft_node = doc.find_first("/atom:entry/app:control/app:draft", NS)
|
54
|
+
if draft_node
|
55
|
+
if draft_node.content == "yes"
|
56
|
+
@status = :draft
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def save
|
62
|
+
puts("writing #{file}")
|
63
|
+
doc.save(file, :indent => true, :encoding => LibXML::XML::Encoding::UTF_8)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
class Directory
|
68
|
+
def initialize(path)
|
69
|
+
@path = path
|
70
|
+
@base_doc = read_base_doc
|
71
|
+
@feed_type = case @base_doc.root.name
|
72
|
+
when "feed"
|
73
|
+
:atom
|
74
|
+
when "rss"
|
75
|
+
:rss
|
76
|
+
else
|
77
|
+
:unknown
|
78
|
+
end
|
79
|
+
|
80
|
+
@git = History.new(self, path)
|
81
|
+
end
|
82
|
+
|
83
|
+
def save
|
84
|
+
@git.commit_posts
|
85
|
+
end
|
86
|
+
|
87
|
+
def read_base_doc
|
88
|
+
feed = LibXML::XML::Parser.file("#{@path}/feed.xml").parse
|
89
|
+
if feed.root.name == "rss"
|
90
|
+
LibXML::XML::Parser.file("#{@path}/channel-1.xml").parse
|
91
|
+
else
|
92
|
+
feed
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def base_doc
|
97
|
+
if ! @base_doc
|
98
|
+
@base_doc = read_base_doc
|
99
|
+
end
|
100
|
+
@base_doc
|
101
|
+
end
|
102
|
+
|
103
|
+
# Iterates the Item objects for the feed, in ascending timestamp order.
#
# Items are collected in a list rather than a Hash keyed by timestamp, so
# posts that share a timestamp (including every undated post, which Item
# gives one fixed fallback time) are all yielded instead of overwriting
# each other. Ties are broken by file name to keep the order deterministic.
#
# Yields each Item to the block.
# Returns an Array holding the block's result for every item.
def each
  items = []
  each_unsorted do |post, filename|
    items << Item.new(post, filename)
  end

  ordered = items.sort_by { |item| [item.timestamp, item.file.to_s] }
  ordered.map { |item| yield item }
end
|
112
|
+
|
113
|
+
# Returns a Hash of descriptive feed metadata; currently only "title".
#
# The title comes from the Atom feed title element, falling back to the RSS
# channel title. When the base document has neither element the "title"
# value is nil, rather than the lookup raising NoMethodError.
def info
  title_node = base_doc.find_first("/atom:feed/atom:title", NS) ||
               base_doc.find_first("/rss/channel/title")

  { "title" => title_node && title_node.content }
end
|
120
|
+
|
121
|
+
def posts_feed
|
122
|
+
feed = read_base_doc
|
123
|
+
case @feed_type
|
124
|
+
when :atom
|
125
|
+
posts_feed_atom(feed)
|
126
|
+
when :rss
|
127
|
+
posts_feed_rss(feed)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
def insert_posts(collection)
|
132
|
+
each do |post|
|
133
|
+
$stderr.puts "adding #{post.file}"
|
134
|
+
collection << collection.doc.import(post.doc.root)
|
135
|
+
end
|
136
|
+
collection
|
137
|
+
end
|
138
|
+
|
139
|
+
def each_unsorted
|
140
|
+
Dir.glob("#{@path}/post-*.xml") do |filename|
|
141
|
+
post = LibXML::XML::Parser.file(filename).parse
|
142
|
+
yield post, filename
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
def posts_feed_atom(doc)
|
147
|
+
insert_posts(doc.root)
|
148
|
+
doc
|
149
|
+
end
|
150
|
+
|
151
|
+
def posts_feed_rss(rssdoc)
|
152
|
+
doc = LibXML::XML::Parser.file("#{@path}/channel-1.xml").parse
|
153
|
+
channel = doc.find_first("/rss/channel");
|
154
|
+
insert_posts(channel)
|
155
|
+
doc
|
156
|
+
end
|
157
|
+
|
158
|
+
def sort_entries(repo_entries)
|
159
|
+
by_date = repo_entries.map do |e|
|
160
|
+
{ :entry => e,
|
161
|
+
:time => Item.new(LibXML::XML::Parser.file(e.path).parse, e.path ).timestamp }
|
162
|
+
end
|
163
|
+
by_date.sort! { |a,b| a[:time] <=> b[:time] }
|
164
|
+
by_date.map { |hash| hash[:entry] }
|
165
|
+
end
|
166
|
+
end
|
data/lib/abelard/dump.rb
ADDED
data/lib/abelard/history.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
# Manage a git repository representing the feed
|
2
|
+
|
3
|
+
require 'rugged'
|
4
|
+
require 'pathname'
|
5
|
+
|
6
|
+
class History
|
7
|
+
# archive is a Directory, dir is a path to store in git
|
8
|
+
def initialize(archive, dir)
|
9
|
+
@archive = archive
|
10
|
+
if File.directory? dir
|
11
|
+
begin
|
12
|
+
repo = Rugged::Repository.discover(dir)
|
13
|
+
|
14
|
+
repo_base = Pathname.new(repo.workdir).realpath.to_s
|
15
|
+
real_dir = Pathname.new(dir).realpath.to_s
|
16
|
+
raise "confused! #{repo_base} #{real_dir}" unless real_dir.start_with?(repo_base)
|
17
|
+
@relative_root = real_dir[repo_base.length+1..-1] || ""
|
18
|
+
$stderr.puts "#{real_dir} in #{repo_base} : #{@relative_root}"
|
19
|
+
|
20
|
+
check_repo_clean(repo, @relative_root)
|
21
|
+
rescue Rugged::RepositoryError
|
22
|
+
repo = Rugged::Repository.init_at(dir)
|
23
|
+
@relative_root = ""
|
24
|
+
end
|
25
|
+
elsif File.exist? dir
|
26
|
+
fail "#{dir} exists as file"
|
27
|
+
else
|
28
|
+
Dir.mkdir(dir)
|
29
|
+
repo = Rugged::Repository.init_at(dir)
|
30
|
+
@relative_root = ""
|
31
|
+
end
|
32
|
+
@repo = repo
|
33
|
+
@dir_path = dir
|
34
|
+
end
|
35
|
+
|
36
|
+
# One file reported by git, addressed three ways:
#
# git_fn is the path to the file relative to the git root
# dir_fn is the path to the file relative to the feed directory
# path is the full path
class Entry
  attr_reader :git_fn, :dir_fn, :path

  # f          - file name as reported by git (relative to the git root)
  # root       - the feed directory relative to the git root; "" when the
  #              feed directory is the git root itself
  # repository - object responding to #workdir (the repository work tree)
  def initialize(f, root, repository)
    @git_fn = f

    # Drop the "<root>/" prefix to get the name inside the feed directory.
    @dir_fn = root.empty? ? f : f[root.length + 1..-1]

    # The full path has to be built from the git-relative name: the work
    # dir is the git root, so joining it with dir_fn loses the feed
    # subdirectory whenever root is not empty. File.join also avoids a
    # doubled separator if workdir already ends in "/".
    @path = File.join(repository.workdir, @git_fn)
  end
end
|
54
|
+
|
55
|
+
def entry(from_git)
|
56
|
+
Entry.new(from_git, @relative_root, @repo)
|
57
|
+
end
|
58
|
+
|
59
|
+
def check_repo_clean(repo, sub)
|
60
|
+
$stderr.puts "check_repo_clean(#{repo},#{sub})"
|
61
|
+
clean = true
|
62
|
+
repo.status do |file, data|
|
63
|
+
change = classify_file(sub, file)
|
64
|
+
clean = false if change == :real
|
65
|
+
end
|
66
|
+
clean
|
67
|
+
end
|
68
|
+
|
69
|
+
def commit_posts
|
70
|
+
repo = @repo
|
71
|
+
sub = @relative_root
|
72
|
+
|
73
|
+
commits = 0
|
74
|
+
todo = { real: [] }
|
75
|
+
@repo.status do |file, data|
|
76
|
+
change = classify_file(sub, file)
|
77
|
+
todo[change] ||= []
|
78
|
+
todo[change] << file
|
79
|
+
end
|
80
|
+
if todo[:top]
|
81
|
+
todo[:top].each do |file|
|
82
|
+
repo.index.add file
|
83
|
+
repo_entry = entry(file)
|
84
|
+
item = Item.new(LibXML::XML::Parser.file(repo_entry.path).parse, repo_entry.path)
|
85
|
+
|
86
|
+
author = {:email => "#{item.author}@example.org",
|
87
|
+
:time => item.timestamp,
|
88
|
+
:name => item.author}
|
89
|
+
parents = []
|
90
|
+
parents << repo.head.target unless repo.head_unborn?
|
91
|
+
commit = Rugged::Commit.create(repo,
|
92
|
+
:author => author,
|
93
|
+
:message => "feed info",
|
94
|
+
:committer => author,
|
95
|
+
:parents => parents,
|
96
|
+
:tree => repo.index.write_tree(repo),
|
97
|
+
:update_ref => "HEAD")
|
98
|
+
commits = commits+1
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
to_commit = @archive.sort_entries(todo[:real].map { |f| entry(f) })
|
103
|
+
|
104
|
+
to_commit.each do |entry|
|
105
|
+
file = entry.git_fn
|
106
|
+
repo.index.add file
|
107
|
+
repo_entry = entry(file)
|
108
|
+
item = Item.new(LibXML::XML::Parser.file(repo_entry.path).parse, repo_entry.path)
|
109
|
+
|
110
|
+
author = {:email => "#{item.author}@example.org",
|
111
|
+
:time => item.timestamp,
|
112
|
+
:name => item.author}
|
113
|
+
|
114
|
+
$stderr.puts "Adding #{file}"
|
115
|
+
|
116
|
+
commit = Rugged::Commit.create(repo,
|
117
|
+
:author => author,
|
118
|
+
:message => "post",
|
119
|
+
:committer => author,
|
120
|
+
:parents => [repo.head.target],
|
121
|
+
:tree => repo.index.write_tree(repo),
|
122
|
+
:update_ref => "HEAD")
|
123
|
+
commits = commits+1
|
124
|
+
end
|
125
|
+
|
126
|
+
repo.index.write if commits > 0
|
127
|
+
end
|
128
|
+
|
129
|
+
# Classifies a changed file by its place in the archive.
#
# subdir - the archive directory relative to the git root ("" when the
#          archive is the whole repository)
# file   - the changed file's path relative to the git root
#
# Returns :outside for files not under subdir, :real for post/comment
# files, :top for feed/channel files and :unknown for anything else.
def classify_file(subdir, file)
  # normally 1 archive = 1 repo, but if you have a repo of several
  # archives, ignore file changes outside
  unless subdir.empty?
    # Compare against "subdir/" so that a sibling directory which merely
    # shares a name prefix (e.g. "blog2" vs "blog") counts as outside.
    prefix = subdir.end_with?("/") ? subdir : "#{subdir}/"
    return :outside unless file.start_with?(prefix)
  end

  filename = Pathname.new(file).basename.to_s

  return :real if filename.start_with?("post-", "comment-")
  return :top if filename.start_with?("feed", "channel")

  :unknown
end
|
141
|
+
end
|
142
|
+
|
data/lib/abelard/list.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'abelard/dir.rb'
|
5
|
+
CONFIG_FILE = "blogfeeds.yaml"
|
6
|
+
|
7
|
+
dest=''
|
8
|
+
if ARGV.length > 0 then
|
9
|
+
if (ARGV[0] == '-h') then
|
10
|
+
$stderr.puts("abelard list -d <dir>\nabelard list <config-entry>
|
11
|
+
abelard list\n")
|
12
|
+
else
|
13
|
+
if (ARGV[0] == '-d') then
|
14
|
+
dest = ARGV[1]
|
15
|
+
else
|
16
|
+
configs = YAML.load_file(CONFIG_FILE)
|
17
|
+
conf = configs[ARGV[0]]
|
18
|
+
dest = conf['dest']
|
19
|
+
end
|
20
|
+
dir = Directory.new(dest)
|
21
|
+
dir.each do |item|
|
22
|
+
printf("%s %s\n", item.timestamp.strftime("%Y-%m-%d"), item.title)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
else
|
26
|
+
configs = YAML.load_file(CONFIG_FILE)
|
27
|
+
configs.each do |name, conf|
|
28
|
+
puts("#{name}: #{conf['urls'].first}")
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
data/lib/abelard/load.rb
ADDED
@@ -0,0 +1,360 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'libxml'
|
3
|
+
require 'net/http'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
require 'abelard/dir.rb'
|
7
|
+
require 'abelard/postxml.rb'
|
8
|
+
|
9
|
+
CONFIG_FILE = "blogfeeds.yaml"
|
10
|
+
|
11
|
+
Debug = true
|
12
|
+
|
13
|
+
|
14
|
+
module Splitter
|
15
|
+
def item(xmlnode, filename)
|
16
|
+
filedoc = LibXML::XML::Document.new()
|
17
|
+
filedoc.root = xmlnode.copy(true)
|
18
|
+
item = Item.new(filedoc, filename)
|
19
|
+
end
|
20
|
+
|
21
|
+
# deprecated
|
22
|
+
def write_item(xmlnode, file)
|
23
|
+
filedoc = LibXML::XML::Document.new()
|
24
|
+
filedoc.root = xmlnode.copy(true)
|
25
|
+
puts("writing #{file}")
|
26
|
+
filedoc.save(file, :indent => true, :encoding => LibXML::XML::Encoding::UTF_8)
|
27
|
+
end
|
28
|
+
|
29
|
+
# stream the document to a string and reparse it to clean up redundant namespaces
|
30
|
+
def write_doc_clean(doc, file)
|
31
|
+
cleandoc = LibXML::XML::Parser.string(doc.to_s, :options => LibXML::XML::Parser::Options::NSCLEAN).parse
|
32
|
+
cleandoc.save(file)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
class Splitter_rss
|
37
|
+
include Splitter
|
38
|
+
NS = ['wp:http://wordpress.org/export/1.2/'];
|
39
|
+
|
40
|
+
def initialize(document, destination)
|
41
|
+
@doc = document
|
42
|
+
@dest = destination
|
43
|
+
end
|
44
|
+
|
45
|
+
def extract_comments(item)
|
46
|
+
# In a wordpress export, the comments are in wp:comment elements
|
47
|
+
basename = Post_id_rss.new(item)
|
48
|
+
|
49
|
+
all = []
|
50
|
+
comment_nodes = item.find("wp:comment", NS)
|
51
|
+
comment_nodes.each do |node|
|
52
|
+
comment_doc = LibXML::XML::Document.new()
|
53
|
+
comment_doc.root = node.remove!
|
54
|
+
approved = comment_doc.find_first("/wp:comment/wp:comment_approved", NS)
|
55
|
+
author_email = comment_doc.find_first("/wp:comment/wp:comment_author_email", NS)
|
56
|
+
author_ip = comment_doc.find_first("/wp:comment/wp:comment_author_IP", NS)
|
57
|
+
id = comment_doc.find_first("/wp:comment/wp:comment_id", NS)
|
58
|
+
|
59
|
+
# delete some sensitive fields
|
60
|
+
author_email.remove! if (author_email)
|
61
|
+
author_ip.remove! if (author_ip)
|
62
|
+
|
63
|
+
if (approved && (approved.content == '1'))
|
64
|
+
filename = basename.as_comment(id.content)
|
65
|
+
|
66
|
+
all << Item.new(comment_doc, "#{@dest}/#{filename}")
|
67
|
+
end
|
68
|
+
end
|
69
|
+
all
|
70
|
+
end
|
71
|
+
|
72
|
+
def split_items
|
73
|
+
channel_count = 1
|
74
|
+
rss = @doc.root
|
75
|
+
@parent = LibXML::XML::Document.new()
|
76
|
+
root = LibXML::XML::Node.new(rss.name)
|
77
|
+
@parent.root = root
|
78
|
+
rss.attributes.each { |a| root.attributes[a.name] = a.value }
|
79
|
+
rss.children.select(&:element?).each do |channel|
|
80
|
+
if (channel.name == "channel")
|
81
|
+
root << channel.clone # shallow copy for feed.xml
|
82
|
+
|
83
|
+
channelself = XmlUtil::self_link(channel)
|
84
|
+
is_comments = (channelself =~ /comments/)
|
85
|
+
|
86
|
+
copy = LibXML::XML::Node.new(channel.name)
|
87
|
+
channel.attributes.each { |a| copy.attributes[a.name] = a.value }
|
88
|
+
channel.children.select(&:element?).each do |node|
|
89
|
+
$stderr.puts(node.name)
|
90
|
+
if (node.name == "item")
|
91
|
+
# attachments dont get saved as posts
|
92
|
+
if ( node.find("wp:attachment_url", "wp:http://wordpress.org/export/1.2/").length > 0 )
|
93
|
+
$stderr.puts("skipping attachment")
|
94
|
+
else
|
95
|
+
# in a wordpress export file, comments are included inside the post item
|
96
|
+
comments = extract_comments(node)
|
97
|
+
save(node)
|
98
|
+
comments.each { |c| c.save }
|
99
|
+
end
|
100
|
+
else
|
101
|
+
copy << node.copy(true)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
ch_copy = root.copy(true)
|
105
|
+
ch_copy << copy
|
106
|
+
unless is_comments
|
107
|
+
channel_doc = LibXML::XML::Document.new()
|
108
|
+
channel_doc.root = ch_copy
|
109
|
+
channel_doc.save("#{@dest}/channel-#{channel_count}.xml")
|
110
|
+
end
|
111
|
+
channel_count = channel_count + 1
|
112
|
+
else
|
113
|
+
root << channel
|
114
|
+
end
|
115
|
+
end
|
116
|
+
@parent.save("#{@dest}/feed.xml")
|
117
|
+
end
|
118
|
+
|
119
|
+
def save(node)
|
120
|
+
filename = Post_id_rss.new(node).to_s
|
121
|
+
new_item = item(node, "#{@dest}/#{filename}")
|
122
|
+
if new_item.status == :published
|
123
|
+
new_item.save
|
124
|
+
else
|
125
|
+
$stderr.puts("skipping #{filename} as status #{new_item.status}")
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
class Splitter_atom
|
131
|
+
include Splitter
|
132
|
+
def initialize(document, destination)
|
133
|
+
@doc = document
|
134
|
+
@dest = destination
|
135
|
+
end
|
136
|
+
|
137
|
+
def split_items
|
138
|
+
feed = @doc.root
|
139
|
+
|
140
|
+
feedself = XmlUtil::self_link(feed)
|
141
|
+
|
142
|
+
@feed_type = nil # unknown
|
143
|
+
@feed_type = "post" if (feedself =~ %r{/posts/default$})
|
144
|
+
@feed_type = "comment" if (feedself =~ %r{/comments/default$})
|
145
|
+
|
146
|
+
@parent = LibXML::XML::Document.new()
|
147
|
+
root = LibXML::XML::Node.new(feed.name)
|
148
|
+
@parent.root = root
|
149
|
+
feed.namespaces.definitions.each {|ns| LibXML::XML::Namespace.new(root, ns.prefix, ns.href)}
|
150
|
+
feed.attributes.each { |a| root.attributes[a.name] = a.value }
|
151
|
+
|
152
|
+
feed.children.select(&:element?).each do |node|
|
153
|
+
if (node.name == "entry")
|
154
|
+
save(node)
|
155
|
+
else
|
156
|
+
root << @parent.import(node)
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
write_doc_clean(@parent, "#{@dest}/feed.xml")
|
161
|
+
end
|
162
|
+
|
163
|
+
def save(node)
|
164
|
+
id = node.children.find { |n| n.name == "id" }
|
165
|
+
id = id && id.content
|
166
|
+
|
167
|
+
path = XmlUtil::self_link(node)
|
168
|
+
|
169
|
+
case node.name
|
170
|
+
when "entry"
|
171
|
+
category = XmlUtil::child_attribute(node, "category", "term")
|
172
|
+
|
173
|
+
if @feed_type
|
174
|
+
entry_type = @feed_type
|
175
|
+
else
|
176
|
+
entry_type = category.split('#').last if category
|
177
|
+
end
|
178
|
+
|
179
|
+
case entry_type
|
180
|
+
when "post"
|
181
|
+
postnumber = path.split('/').last
|
182
|
+
filename = "#{@dest}/post-#{postnumber}.xml"
|
183
|
+
write_item(node, filename)
|
184
|
+
when "comment"
|
185
|
+
pathsplit = path.split('/')
|
186
|
+
postnumber = pathsplit[-4]
|
187
|
+
commentnumber = pathsplit[-1]
|
188
|
+
filename = "#{@dest}/comment-#{postnumber}-#{commentnumber}.xml"
|
189
|
+
write_item(node,filename)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
class SourceBase
|
196
|
+
def process(parser, destination)
|
197
|
+
doc = parser.parse
|
198
|
+
|
199
|
+
case doc.root.name
|
200
|
+
when "feed"
|
201
|
+
atom = Splitter_atom.new(doc, destination)
|
202
|
+
atom.split_items
|
203
|
+
when "rss"
|
204
|
+
rss = Splitter_rss.new(doc, destination)
|
205
|
+
rss.split_items
|
206
|
+
else
|
207
|
+
puts "don't know what to do with element #{doc.root.name}"
|
208
|
+
end
|
209
|
+
|
210
|
+
archive = Directory.new(destination)
|
211
|
+
archive.save
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
class FileSource < SourceBase
|
216
|
+
def initialize(filename, dest)
|
217
|
+
@file = filename
|
218
|
+
@dest = dest
|
219
|
+
end
|
220
|
+
|
221
|
+
def load
|
222
|
+
ensure_dest(@dest)
|
223
|
+
parser = LibXML::XML::Parser.file(@file)
|
224
|
+
process(parser, @dest)
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
class Source < SourceBase
|
229
|
+
def initialize(conf_in)
|
230
|
+
if conf_in.respond_to?(:keys)
|
231
|
+
@conf = conf_in
|
232
|
+
else
|
233
|
+
@conf = get_config(conf_in) || die("No config for #{conf_in}")
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
def load
|
238
|
+
conf = @conf
|
239
|
+
dest = conf["dest"] || die("No 'dest' directory defined")
|
240
|
+
urls = conf["urls"] || die("No urls defined")
|
241
|
+
|
242
|
+
ensure_dest(dest)
|
243
|
+
|
244
|
+
fetcher = Fetcher.new
|
245
|
+
fetcher.user = conf["user"]
|
246
|
+
fetcher.password = conf["password"]
|
247
|
+
|
248
|
+
urls.each do |urlpath|
|
249
|
+
url = URI(urlpath)
|
250
|
+
fetcher.get(url) do |response|
|
251
|
+
write_raw(response.body, "#{dest}/raw.xml") if Debug
|
252
|
+
parser = LibXML::XML::Parser.string(response.body)
|
253
|
+
process(parser, dest)
|
254
|
+
end
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
def all_configs
|
259
|
+
YAML.load_file(CONFIG_FILE) || {}
|
260
|
+
end
|
261
|
+
|
262
|
+
def get_config(key)
|
263
|
+
configuration_file = all_configs
|
264
|
+
configuration_file[key]
|
265
|
+
end
|
266
|
+
|
267
|
+
def save_config(key, conf)
|
268
|
+
configuration_data = all_configs
|
269
|
+
if configuration_data.has_key? key
|
270
|
+
die("Already have config for #{key}")
|
271
|
+
else
|
272
|
+
configuration_data[key] = conf
|
273
|
+
open(CONFIG_FILE, 'w+') do |conf_file|
|
274
|
+
conf_file.puts(configuration_data.to_yaml)
|
275
|
+
end
|
276
|
+
end
|
277
|
+
end
|
278
|
+
|
279
|
+
def write_raw(data, filename)
|
280
|
+
File.open(filename, "w") { |f| f.write(data) }
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
def die(error)
|
285
|
+
puts error
|
286
|
+
exit 1
|
287
|
+
end
|
288
|
+
|
289
|
+
# Wrap an HTTP session, handle making a request and
# following any redirects
#
# user and password are optional; when user is set, HTTP basic auth is
# attached to every request made through #get.
# NOTE(review): that includes requests to redirect targets on other hosts -
# confirm that forwarding the credentials there is intended.
class Fetcher
  attr_accessor :user, :password

  def initialize
    @endpoint = nil
    @host = nil
    @session = nil
    @user = nil
    @password = nil
  end

  # Fetches url and yields the response when the server answers 200 OK.
  #
  # url       - a URI object
  # max_depth - how many further redirects may still be followed
  #
  # Any redirect response carrying a Location header is followed (not only
  # 301), and a relative Location is resolved against the requesting url.
  # A redirect back to the same url, running out of redirects, or any other
  # status is reported on stdout and the block is not called.
  def get(url, max_depth=3, &block)
    request = Net::HTTP::Get.new(url)
    request.basic_auth(user, password) if user

    msg = "Reading (#{url})"
    msg += " as #{user}" if user
    $stderr.puts(msg)

    feedxml = session_for(url).request(request)
    if feedxml.is_a?(Net::HTTPOK)
      yield feedxml
    elsif feedxml.is_a?(Net::HTTPRedirection) && feedxml['Location']
      # merge handles both absolute and relative Location values
      new_url = url.merge(feedxml['Location'])
      if new_url.to_s == url.to_s
        puts("Confused! redirect to same url #{new_url}")
      elsif max_depth == 0
        puts("Too many redirects")
      else
        puts("Redirecting to #{new_url}")
        get(new_url, max_depth - 1, &block)
      end
    else
      puts("GET returned #{feedxml.code}")
      puts(feedxml.body)
    end
  end

  private

  # Returns the Net::HTTP session for url, creating a new one whenever the
  # scheme, host or port differs from the previous request (a session for
  # the same host on another port or scheme must not be reused).
  def session_for(url)
    endpoint = [url.scheme, url.host, url.port]
    if endpoint != @endpoint
      @endpoint = endpoint
      @host = url.host
      @session = Net::HTTP.new(url.host, url.port)
      @session.use_ssl = (url.scheme == 'https')
    end
    @session
  end
end
|
334
|
+
|
335
|
+
def ensure_dest(dest)
|
336
|
+
Dir::mkdir(dest) unless File.directory?(dest)
|
337
|
+
unless File.directory?(dest)
|
338
|
+
$stderr.puts "Could not create directory #{dest}"
|
339
|
+
end
|
340
|
+
end
|
341
|
+
|
342
|
+
if ARGV.empty?
|
343
|
+
$stderr.puts "Syntax:\nabelard load -f <filename> <destination>\nabelard load {-n <url>} ... <destination>\nabelard load <config>"
|
344
|
+
exit(1)
|
345
|
+
elsif ARGV[0] == "-f"
|
346
|
+
source = FileSource.new(ARGV[1], ARGV[2])
|
347
|
+
elsif ARGV[0] == '-n'
|
348
|
+
urls = []
|
349
|
+
while ARGV[0] == '-n'
|
350
|
+
ARGV.shift
|
351
|
+
urls << ARGV.shift
|
352
|
+
end
|
353
|
+
conf = {"urls" => urls, "dest" => ARGV[0]}
|
354
|
+
source = Source.new(conf)
|
355
|
+
source.save_config(conf["dest"],conf)
|
356
|
+
else
|
357
|
+
key = ARGV[0]
|
358
|
+
source = Source.new(key)
|
359
|
+
end
|
360
|
+
source.load
|
data/lib/abelard/postxml.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'digest/sha1'
require 'uri'
|
2
|
+
|
3
|
+
# Small helpers for reading child elements and attributes of an XML node.
# Every lookup returns nil when the requested element or attribute is absent.
module XmlUtil
  # Returns the content of the first child of node named elementname,
  # or nil if there is no such child.
  def self.child_content(node, elementname)
    el = node.children.find { |n| n.name == elementname }
    el && el.content
  end

  # Returns the value of attribute attributename on the first child of node
  # named elementname, or nil if the child or the attribute is missing.
  # (The attribute name is taken from the argument, not fixed to "term".)
  def self.child_attribute(node, elementname, attributename)
    el = node.children.find { |n| n.name == elementname }
    attr = el && el.attributes.get_attribute(attributename)
    attr && attr.value
  end

  # True when node has attribute attributename with value attributevalue;
  # nil when the attribute is absent, false when the value differs.
  def self.with_attribute(node, attributename, attributevalue)
    a = node.attributes.get_attribute(attributename)
    a && (a.value == attributevalue)
  end

  # Returns the href of node's <link rel="self"> child, or nil when there is
  # no such link or the link carries no href attribute.
  def self.self_link(node)
    el = node.children.find { |l| (l.name == "link") && with_attribute(l, "rel", "self") }
    href = el && el.attributes.get_attribute("href")
    href && href.value
  end
end
|
22
|
+
|
23
|
+
|
24
|
+
# Derives archive file names for an RSS item from its <guid>.
#
# When the item has no guid, names are derived from a SHA1 digest of the
# item's XML text, so the same item gets the same name on every run.
# (String#hash is seeded per process and may be negative, so it cannot
# provide a stable file name.)
class Post_id_rss
  attr_reader :idurl

  # postxml - the item node; its "guid" child content becomes idurl and its
  #           string form is kept as the fallback identity.
  def initialize(postxml)
    #XmlUtil::child_content(postxml, "post_id") ||
    @idurl = XmlUtil::child_content(postxml, "guid")
    @raw = postxml.to_s
  end

  # Returns the file name for the item: "comment-<post>-<comment>.xml" when
  # the guid looks like a comment url, otherwise "post-<id>.xml".
  def to_s
    return improvise unless idurl

    commenturl = /\?p(age_id)?=(\d+)(\.xml)?#comment-(.*)$/.match(idurl) ||
                 /^(.*)\/(\d{4}\/.*)\/#(comment)-(.*)$/.match(idurl)

    if commenturl
      postnumber = commenturl[2].sub(/^\//, '').sub(/\.xml$/, '').gsub('/', '-')
      commentnumber = commenturl[4]
      "comment-#{postnumber}-#{commentnumber}.xml"
    else
      "post-#{post_match}.xml"
    end
  end

  # Returns the post identifier: the number from a "?p=NNN" / "?page_id=NNN"
  # guid, otherwise a sanitized form of the guid. Without a guid the digest
  # of the item text is used, so callers such as as_comment do not crash.
  def post_match
    return raw_digest unless idurl

    posturl = /\?p(age_id)?=(\d+)(\.xml)?$/.match(idurl)
    posturl ? posturl[2] : sanitize
  end

  # Returns the file name for comment commentnumber on this post.
  def as_comment(commentnumber)
    "comment-#{post_match}-#{commentnumber}.xml"
  end

  # Turns the guid into a string usable inside a file name.
  #
  # "tag:" guids yield the text after their last "-"; other URIs yield their
  # path, query and fragment joined with "-". URI() raises on malformed
  # input rather than returning nil, so that case is rescued: the guid is
  # reported on stderr and reduced to its filename-safe characters.
  def sanitize
    return raw_digest unless idurl

    begin
      uri = URI(idurl)
    rescue URI::InvalidURIError
      $stderr.puts("Could not parse url #{idurl}")
      cleaned = idurl.gsub(/[^A-Za-z0-9._-]+/, '-').sub(/^-+/, '').sub(/-+$/, '')
      return cleaned.empty? ? raw_digest : cleaned
    end

    return idurl.split('-').last if uri.scheme == "tag"

    # Fall back to the opaque part when the URI has no path component.
    build = (uri.path || uri.opaque || '').sub(/^\//, '').sub(/\.xml$/, '').gsub('/', '-')
    build.concat('-' + uri.query.gsub(/[?&]/, '-')) if uri.query
    build.concat('-' + uri.fragment) if uri.fragment
    build
  end

  # Returns a post file name for an item that has no guid.
  def improvise
    "post-#{raw_digest}.xml"
  end

  private

  # First 16 hex digits of the SHA1 digest of the item's XML text.
  def raw_digest
    Digest::SHA1.hexdigest(@raw)[0, 16]
  end
end
|
80
|
+
|
data/lib/abelard/web.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'sinatra/base'
|
3
|
+
require 'yaml'
|
4
|
+
require 'abelard/dir.rb'
|
5
|
+
require 'abelard/archive.rb'
|
6
|
+
|
7
|
+
CONFIG_FILE="blogfeeds.yaml"
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
class FeedServer < Sinatra::Base
|
12
|
+
archive = Archive.new(CONFIG_FILE)
|
13
|
+
set :bind, "0.0.0.0"
|
14
|
+
|
15
|
+
get '/' do
|
16
|
+
template = <<ERB
|
17
|
+
<html><body><h2>Feeds</h2><dl>
|
18
|
+
<% archive.available.each do |blog| %>
|
19
|
+
<dt><%= blog %></dt>
|
20
|
+
<dd><%= archive.dir(blog).info["title"] %></dd>
|
21
|
+
<dd><a href="/<%= blog %>/posts">posts</a></dd>
|
22
|
+
<% end %>
|
23
|
+
</dl></body></html>
|
24
|
+
ERB
|
25
|
+
erb template, :locals => { :archive => archive }
|
26
|
+
end
|
27
|
+
|
28
|
+
get '/*/posts' do |blog|
|
29
|
+
headers "Content-Type" => "application/atom+xml"
|
30
|
+
archive.dir(blog).posts_feed.to_s
|
31
|
+
end
|
32
|
+
|
33
|
+
run!
|
34
|
+
end
|
35
|
+
|
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: abelard
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Anomaly UK
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-08-26 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rugged
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.23'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.23'
|
27
|
+
description: Persist blogs and similar web content as sharable git repositories
|
28
|
+
email: anomalyuk@tesco.net
|
29
|
+
executables:
|
30
|
+
- abelard
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- bin/abelard
|
35
|
+
- lib/abelard/archive.rb
|
36
|
+
- lib/abelard/dir.rb
|
37
|
+
- lib/abelard/dump.rb
|
38
|
+
- lib/abelard/history.rb
|
39
|
+
- lib/abelard/list.rb
|
40
|
+
- lib/abelard/load.rb
|
41
|
+
- lib/abelard/postxml.rb
|
42
|
+
- lib/abelard/web.rb
|
43
|
+
homepage: http://anomalyuk.blogspot.com/
|
44
|
+
licenses:
|
45
|
+
- GPL-2.0
|
46
|
+
metadata: {}
|
47
|
+
post_install_message:
|
48
|
+
rdoc_options: []
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
requirements: []
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 2.5.2
|
64
|
+
signing_key:
|
65
|
+
specification_version: 4
|
66
|
+
summary: Abelard blog archiver
|
67
|
+
test_files: []
|