abelard 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 11d17040bfbcb446c38d93ce3ddaf614a55fb7c1
4
+ data.tar.gz: b40187d7a1c261912e3e2655e7783a6e4b9915c7
5
+ SHA512:
6
+ metadata.gz: cb41b5744e105884515e1e8e8513bc6601c132a4b332e7981f6fe038df48b7ff7cb797bac6f3ac6c9a5d20d62656c1c46d7f50c361dce387b341591265086edc
7
+ data.tar.gz: be36ec1cb0beb93e410226221bbe4faacb95aed8f156aec17b2be2b8c6f4803939caa939ef01800463f1119da6c296ceafc5bedc17cbd7165e9c6074375b37e0
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Dispatch `abelard <command> [args...]` to lib/abelard/<command>.rb.
# The command word is removed from ARGV, so the loaded script sees only
# the arguments that follow it.
cmd = ARGV.shift

known = %w( load dump list web )

if known.include?(cmd)
  # Only names from the list above ever reach `require`.
  require "abelard/#{cmd}"
else
  if cmd
    $stderr.puts "Unknown command #{cmd}"
  else
    $stderr.puts "No command given"
  end
  $stderr.puts "Use one of #{known.join(', ')}"
  # Report the failure to the caller instead of exiting with status 0.
  exit 1
end
@@ -0,0 +1,17 @@
1
+ require 'yaml'
2
+
3
# Read-only view of the feed configuration file: maps each configured
# blog name to the Directory that holds its archive.
class Archive
  # file - path of a YAML file whose top level maps a blog name to its
  #        settings (the "dest" setting is the one used here).
  def initialize(file)
    @configfile = file
    # An empty YAML document loads as false/nil; treat it as "no blogs"
    # so that #available and #dir do not fail on a non-Hash.
    @config = YAML.load_file(file) || {}
  end

  # Returns a Directory for the "dest" setting of the named blog.
  #
  # Raises ArgumentError when the blog is not configured or has no "dest".
  def dir(blog)
    settings = @config[blog]
    raise ArgumentError, "No config for #{blog}" unless settings

    dest = settings["dest"]
    raise ArgumentError, "No 'dest' directory defined for #{blog}" unless dest

    Directory.new(dest)
  end

  # Returns the names of all configured blogs.
  def available
    @config.keys
  end
end
@@ -0,0 +1,166 @@
1
+ require 'libxml'
2
+ require 'time'
3
+ require 'abelard/history'
4
+
5
+ # known namespaces for xpath search
6
# Namespace declarations, each written as "prefix:uri", that are passed
# along with the XPath expressions in this file. Frozen so the shared
# list cannot be modified by accident.
NS = [
  "atom:http://www.w3.org/2005/Atom",
  "dc:http://purl.org/dc/elements/1.1/",
  "app:http://purl.org/atom/app#",
  "wp:http://wordpress.org/export/1.2/"
].freeze
12
+
13
# One stored entry (an Atom <entry> or an RSS <item> document) together
# with the metadata read from it.
class Item
  attr_accessor :timestamp, :title, :file, :doc, :author, :status

  # xml      - document answering find_first(xpath[, namespaces])
  # filename - where #save writes the document
  #
  # Each field is looked up first at its Atom location and then at its
  # RSS location; a default is used when neither is present.
  def initialize(xml, filename)
    @doc = xml
    @file = filename

    published = doc.find_first("/atom:entry/atom:published", NS) ||
                doc.find_first("/item/pubDate")
    @timestamp = published ? Time.parse(published.content) : Time.new(0)

    heading = doc.find_first("/atom:entry/atom:title", NS) ||
              doc.find_first("/item/title")
    @title = heading ? heading.content : "Post"

    writer = doc.find_first("/atom:entry/atom:author/atom:name", NS) ||
             doc.find_first("/item/dc:creator", NS)
    @author = writer ? writer.content : 'abelard'

    # Published unless the document says otherwise.
    @status = :published

    wp_status = doc.find_first("/item/wp:status", NS)
    if wp_status
      $stderr.puts("raw status #{wp_status.content}")
      case wp_status.content
      when "trash" then @status = :trash
      when "draft" then @status = :draft
      end
    end

    atom_draft = doc.find_first("/atom:entry/app:control/app:draft", NS)
    @status = :draft if atom_draft && atom_draft.content == "yes"
  end

  # Writes the document to #file, indented and encoded as UTF-8.
  def save
    puts("writing #{file}")
    doc.save(file, :indent => true, :encoding => LibXML::XML::Encoding::UTF_8)
  end
end
66
+
67
# A feed archive stored in a directory: <path>/feed.xml (and, when that
# file is RSS, <path>/channel-1.xml) plus one post-*.xml file per post.
class Directory
  # path - directory containing the archive. The base document is read
  # immediately and a History is created for the same path.
  def initialize(path)
    @path = path
    @base_doc = read_base_doc
    @feed_type = case @base_doc.root.name
                 when "feed" then :atom
                 when "rss" then :rss
                 else :unknown
                 end

    @git = History.new(self, path)
  end

  # Delegates to History#commit_posts.
  def save
    @git.commit_posts
  end

  # Parses and returns feed.xml, or channel-1.xml when feed.xml has an
  # <rss> root.
  def read_base_doc
    feed = LibXML::XML::Parser.file("#{@path}/feed.xml").parse
    if feed.root.name == "rss"
      LibXML::XML::Parser.file("#{@path}/channel-1.xml").parse
    else
      feed
    end
  end

  # The base document, read on first use.
  def base_doc
    @base_doc ||= read_base_doc
  end

  # Yields the Item objects for the feed, ordered by timestamp.
  #
  # Items are collected in a list rather than a Hash keyed by timestamp,
  # so posts sharing a timestamp are all yielded; ties are ordered by
  # file name. Returns the block's results, or an Enumerator when called
  # without a block.
  def each
    return enum_for(:each) unless block_given?

    items = []
    each_unsorted { |post, filename| items << Item.new(post, filename) }
    ordered = items.sort_by { |item| [item.timestamp, item.file.to_s] }
    ordered.map { |item| yield item }
  end

  # Returns a Hash whose "title" entry is the feed title, or nil when the
  # base document has no title element.
  def info
    inf = {}
    el = base_doc.find_first("/atom:feed/atom:title", NS) ||
         base_doc.find_first("/rss/channel/title")
    inf["title"] = el && el.content
    inf
  end

  # Returns a freshly read base document with every post inserted.
  # Returns nil when the feed type is neither Atom nor RSS.
  def posts_feed
    feed = read_base_doc
    case @feed_type
    when :atom
      posts_feed_atom(feed)
    when :rss
      posts_feed_rss(feed)
    end
  end

  # Imports the root of every post, in #each order, into the collection
  # node and returns the collection.
  def insert_posts(collection)
    each do |post|
      $stderr.puts "adding #{post.file}"
      collection << collection.doc.import(post.doc.root)
    end
    collection
  end

  # Yields (parsed document, filename) for each post-*.xml file, in the
  # order Dir.glob returns them.
  def each_unsorted
    Dir.glob("#{@path}/post-*.xml") do |filename|
      post = LibXML::XML::Parser.file(filename).parse
      yield post, filename
    end
  end

  # Inserts the posts under the root of the given Atom document.
  def posts_feed_atom(doc)
    insert_posts(doc.root)
    doc
  end

  # Inserts the posts under /rss/channel of channel-1.xml.
  # The rssdoc argument is not used; channel-1.xml is parsed again here.
  def posts_feed_rss(rssdoc)
    doc = LibXML::XML::Parser.file("#{@path}/channel-1.xml").parse
    channel = doc.find_first("/rss/channel")
    insert_posts(channel)
    doc
  end

  # Returns the given entries (objects answering #path) ordered by the
  # timestamp of the document each path points to; ties are ordered by
  # path.
  def sort_entries(repo_entries)
    repo_entries.sort_by do |e|
      [Item.new(LibXML::XML::Parser.file(e.path).parse, e.path).timestamp, e.path.to_s]
    end
  end
end
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'abelard/dir.rb'
4
+
5
# Print the feed, with all archived posts inserted, for the directory
# named by the first argument.
if ARGV.empty?
  # Without this check the nil path would be interpolated and the
  # program would try to read "/feed.xml".
  $stderr.puts "Syntax:\nabelard dump <directory>"
  exit(1)
end

dir = Directory.new(ARGV[0])
puts dir.posts_feed.to_s
@@ -0,0 +1,142 @@
1
+ # Manage a git repository representing the feed
2
+
3
+ require 'rugged'
4
+ require 'pathname'
5
+
6
# Records the files of a feed archive as commits in a git repository.
class History
  # archive is a Directory, dir is a path to store in git
  #
  # Uses the repository that contains dir when there is one, otherwise
  # initialises a new repository at dir (creating the directory first if
  # it does not exist). Fails when dir exists but is a plain file.
  def initialize(archive, dir)
    @archive = archive
    if File.directory? dir
      begin
        repo = Rugged::Repository.discover(dir)

        repo_base = Pathname.new(repo.workdir).realpath.to_s
        real_dir = Pathname.new(dir).realpath.to_s
        raise "confused! #{repo_base} #{real_dir}" unless real_dir.start_with?(repo_base)
        # Path of dir below the repository root; "" when dir is the root.
        @relative_root = real_dir[repo_base.length+1..-1] || ""
        $stderr.puts "#{real_dir} in #{repo_base} : #{@relative_root}"

        # The result is not acted on here.
        check_repo_clean(repo, @relative_root)
      rescue Rugged::RepositoryError
        repo = Rugged::Repository.init_at(dir)
        @relative_root = ""
      end
    elsif File.exist? dir
      fail "#{dir} exists as file"
    else
      Dir.mkdir(dir)
      repo = Rugged::Repository.init_at(dir)
      @relative_root = ""
    end
    @repo = repo
    @dir_path = dir
  end

  class Entry
    # dir_fn is the path to the file relative to the feed directory
    # git_fn is the path to the file relative to the git root
    # path is the full path

    attr_reader :git_fn, :dir_fn, :path

    # f          - file name relative to the git root
    # root       - feed directory relative to the git root ("" if same)
    # repository - object answering #workdir
    def initialize(f, root, repository)
      @git_fn = f
      @dir_fn = root.empty? ? f : f[root.length+1..-1]
      # The full path is built from the name relative to the git root;
      # using dir_fn would lose the sub-directory when the archive is
      # not at the top of the repository.
      @path = File.join(repository.workdir, @git_fn)
    end
  end

  # Builds an Entry for a file name as reported by the repository status.
  def entry(from_git)
    Entry.new(from_git, @relative_root, @repo)
  end

  # Returns false when the repository status reports a post or comment
  # file below sub, true otherwise.
  def check_repo_clean(repo, sub)
    $stderr.puts "check_repo_clean(#{repo},#{sub})"
    clean = true
    repo.status do |file, _data|
      clean = false if classify_file(sub, file) == :real
    end
    clean
  end

  # Creates one commit per changed feed/channel file ("feed info") and
  # then one per changed post or comment file ("post"), the latter in the
  # order given by the archive's sort_entries. The index is written when
  # at least one commit was made.
  def commit_posts
    todo = { real: [] }
    @repo.status do |file, _data|
      change = classify_file(@relative_root, file)
      (todo[change] ||= []) << file
    end

    commits = 0
    (todo[:top] || []).each do |file|
      commit_file(entry(file), "feed info")
      commits += 1
    end

    to_commit = @archive.sort_entries(todo[:real].map { |f| entry(f) })
    to_commit.each do |post|
      $stderr.puts "Adding #{post.git_fn}"
      commit_file(post, "post")
      commits += 1
    end

    @repo.index.write if commits > 0
  end

  # Classifies a file name (relative to the git root):
  #   :outside - not below subdir
  #   :real    - base name starts with "post-" or "comment-"
  #   :top     - base name starts with "feed" or "channel"
  #   :unknown - anything else
  def classify_file(subdir, file)
    # normally 1 archive = 1 repo, but if you have a repo of several
    # archives, ignore file changes outside.
    # The separator is part of the test so that "blog2/x" is not taken
    # to be inside "blog".
    return :outside unless subdir.empty? || file.start_with?("#{subdir}/")

    filename = Pathname.new(file).basename.to_s

    return :real if filename.start_with?("post-") || filename.start_with?("comment-")
    return :top if filename.start_with?("feed") || filename.start_with?("channel")

    :unknown
  end

  private

  # Adds one file to the index and commits it with the given message.
  # Author name and commit time are taken from the Item parsed from the
  # file. The current HEAD is the parent unless the repository has no
  # commit yet.
  def commit_file(repo_entry, message)
    @repo.index.add repo_entry.git_fn
    item = Item.new(LibXML::XML::Parser.file(repo_entry.path).parse, repo_entry.path)

    author = { :email => "#{item.author}@example.org",
               :time => item.timestamp,
               :name => item.author }
    parents = []
    parents << @repo.head.target unless @repo.head_unborn?

    Rugged::Commit.create(@repo,
                          :author => author,
                          :message => message,
                          :committer => author,
                          :parents => parents,
                          :tree => @repo.index.write_tree(@repo),
                          :update_ref => "HEAD")
  end
end
142
+
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'yaml'
4
+ require 'abelard/dir.rb'
5
CONFIG_FILE = "blogfeeds.yaml"

# abelard list                 print every configured feed and its first url
# abelard list <config-entry>  print date and title of each archived post
# abelard list -d <dir>        the same, for an archive directory
# abelard list -h              print the syntax
if ARGV.empty?
  # An empty config file loads as false/nil; treat it as no feeds.
  configs = YAML.load_file(CONFIG_FILE) || {}
  configs.each do |name, conf|
    puts("#{name}: #{Array(conf['urls']).first}")
  end
elsif ARGV[0] == '-h'
  $stderr.puts("abelard list -d <dir>\nabelard list <config-entry>\nabelard list\n")
else
  if ARGV[0] == '-d'
    dest = ARGV[1]
    unless dest
      $stderr.puts("abelard list -d <dir>: no directory given")
      exit(1)
    end
  else
    configs = YAML.load_file(CONFIG_FILE) || {}
    conf = configs[ARGV[0]]
    unless conf
      $stderr.puts("No config for #{ARGV[0]}")
      exit(1)
    end
    dest = conf['dest']
    unless dest
      $stderr.puts("No 'dest' directory defined for #{ARGV[0]}")
      exit(1)
    end
  end

  dir = Directory.new(dest)
  dir.each do |item|
    printf("%s %s\n", item.timestamp.strftime("%Y-%m-%d"), item.title)
  end
end
31
+
@@ -0,0 +1,360 @@
1
+ #!/usr/bin/env ruby
2
+ require 'libxml'
3
+ require 'net/http'
4
+ require 'yaml'
5
+
6
+ require 'abelard/dir.rb'
7
+ require 'abelard/postxml.rb'
8
+
9
# Name of the YAML file holding the feed configurations; being a
# relative name, it is resolved against the current directory.
CONFIG_FILE = "blogfeeds.yaml".freeze

# When true, Source#load also writes each fetched body to <dest>/raw.xml.
Debug = true
12
+
13
+
14
# Helpers mixed into the feed splitters.
module Splitter
  # Returns an Item whose document is a new document holding a deep copy
  # of xmlnode as its root.
  def item(xmlnode, filename)
    standalone = LibXML::XML::Document.new
    standalone.root = xmlnode.copy(true)
    Item.new(standalone, filename)
  end

  # deprecated
  #
  # Saves a deep copy of xmlnode as a document of its own in file.
  def write_item(xmlnode, file)
    standalone = LibXML::XML::Document.new
    standalone.root = xmlnode.copy(true)
    puts("writing #{file}")
    standalone.save(file, :indent => true, :encoding => LibXML::XML::Encoding::UTF_8)
  end

  # stream the document to a string and reparse it to clean up redundant namespaces
  def write_doc_clean(doc, file)
    reparsed = LibXML::XML::Parser.string(
      doc.to_s,
      :options => LibXML::XML::Parser::Options::NSCLEAN
    ).parse
    reparsed.save(file)
  end
end
35
+
36
# Splits an RSS document into feed.xml, channel-N.xml and one file per
# post and per approved comment under the destination directory.
class Splitter_rss
  include Splitter
  NS = ['wp:http://wordpress.org/export/1.2/']

  def initialize(document, destination)
    @doc = document
    @dest = destination
  end

  # Detaches every wp:comment element from item and returns an Item for
  # each one whose wp:comment_approved is '1'.
  def extract_comments(item)
    # In a wordpress export, the comments are in wp:comment elements
    post_id = Post_id_rss.new(item)

    kept = []
    item.find("wp:comment", NS).each do |node|
      comment_doc = LibXML::XML::Document.new
      comment_doc.root = node.remove!
      approved = comment_doc.find_first("/wp:comment/wp:comment_approved", NS)
      email = comment_doc.find_first("/wp:comment/wp:comment_author_email", NS)
      ip = comment_doc.find_first("/wp:comment/wp:comment_author_IP", NS)
      id = comment_doc.find_first("/wp:comment/wp:comment_id", NS)

      # delete some sensitive fields
      email.remove! if email
      ip.remove! if ip

      next unless approved && approved.content == '1'

      filename = post_id.as_comment(id.content)
      kept << Item.new(comment_doc, "#{@dest}/#{filename}")
    end
    kept
  end

  # Writes the posts and comments of every channel, a channel-N.xml for
  # each channel whose self link does not match /comments/, and finally
  # feed.xml.
  def split_items
    channel_count = 1
    rss = @doc.root
    @parent = LibXML::XML::Document.new
    root = LibXML::XML::Node.new(rss.name)
    @parent.root = root
    rss.attributes.each { |attr| root.attributes[attr.name] = attr.value }

    rss.children.select(&:element?).each do |child|
      unless child.name == "channel"
        root << child
        next
      end

      root << child.clone # shallow copy for feed.xml

      is_comments = (XmlUtil::self_link(child) =~ /comments/)
      stripped = channel_without_items(child)

      channel_root = root.copy(true)
      channel_root << stripped
      unless is_comments
        channel_doc = LibXML::XML::Document.new
        channel_doc.root = channel_root
        channel_doc.save("#{@dest}/channel-#{channel_count}.xml")
      end
      # Counted for every channel, written or not.
      channel_count += 1
    end

    @parent.save("#{@dest}/feed.xml")
  end

  # Saves the node as a post file unless its status is not :published.
  def save(node)
    filename = Post_id_rss.new(node).to_s
    new_item = item(node, "#{@dest}/#{filename}")
    if new_item.status != :published
      $stderr.puts("skipping #{filename} as status #{new_item.status}")
    else
      new_item.save
    end
  end

  private

  # Returns a new node with the channel's name, attributes and deep
  # copies of its non-item element children. Item children are saved
  # (with their comments) as they are met, except attachments.
  def channel_without_items(channel)
    copy = LibXML::XML::Node.new(channel.name)
    channel.attributes.each { |attr| copy.attributes[attr.name] = attr.value }

    channel.children.select(&:element?).each do |node|
      $stderr.puts(node.name)
      if node.name != "item"
        copy << node.copy(true)
      elsif node.find("wp:attachment_url", "wp:http://wordpress.org/export/1.2/").length > 0
        # attachments dont get saved as posts
        $stderr.puts("skipping attachment")
      else
        # in a wordpress export file, comments are included inside the post item
        comments = extract_comments(node)
        save(node)
        comments.each { |c| c.save }
      end
    end

    copy
  end
end
129
+
130
# Splits an Atom document into feed.xml and one file per post or comment
# entry under the destination directory.
class Splitter_atom
  include Splitter

  def initialize(document, destination)
    @doc = document
    @dest = destination
  end

  # Saves every <entry> child of the feed and writes the remaining
  # element children, under a copy of the feed root, to feed.xml.
  def split_items
    feed = @doc.root
    feedself = XmlUtil::self_link(feed)

    # nil means unknown; the entry's own category is used instead.
    @feed_type =
      if feedself =~ %r{/comments/default$}
        "comment"
      elsif feedself =~ %r{/posts/default$}
        "post"
      end

    @parent = LibXML::XML::Document.new
    root = LibXML::XML::Node.new(feed.name)
    @parent.root = root
    feed.namespaces.definitions.each do |ns|
      LibXML::XML::Namespace.new(root, ns.prefix, ns.href)
    end
    feed.attributes.each { |attr| root.attributes[attr.name] = attr.value }

    feed.children.select(&:element?).each do |node|
      if node.name == "entry"
        save(node)
      else
        root << @parent.import(node)
      end
    end

    write_doc_clean(@parent, "#{@dest}/feed.xml")
  end

  # Writes an entry node to post-<n>.xml or comment-<post>-<n>.xml; the
  # numbers are segments of the entry's self link. Entries that are
  # neither posts nor comments, and nodes that are not entries, are
  # ignored.
  def save(node)
    path = XmlUtil::self_link(node)
    return unless node.name == "entry"

    category = XmlUtil::child_attribute(node, "category", "term")
    entry_type = @feed_type || (category && category.split('#').last)

    case entry_type
    when "post"
      postnumber = path.split('/').last
      write_item(node, "#{@dest}/post-#{postnumber}.xml")
    when "comment"
      segments = path.split('/')
      postnumber = segments[-4]
      commentnumber = segments[-1]
      write_item(node, "#{@dest}/comment-#{postnumber}-#{commentnumber}.xml")
    end
  end
end
194
+
195
# Behaviour shared by the feed sources.
class SourceBase
  # Parses the feed, splits it into the destination directory with the
  # splitter matching its root element ("feed" or "rss"), then saves the
  # Directory for the destination.
  def process(parser, destination)
    doc = parser.parse
    root_name = doc.root.name

    if root_name == "feed"
      Splitter_atom.new(doc, destination).split_items
    elsif root_name == "rss"
      Splitter_rss.new(doc, destination).split_items
    else
      puts "don't know what to do with element #{root_name}"
    end

    Directory.new(destination).save
  end
end
214
+
215
# A feed read from a local file.
class FileSource < SourceBase
  # filename - feed file to read
  # dest     - directory to split it into
  def initialize(filename, dest)
    @file, @dest = filename, dest
  end

  # Makes sure the destination exists, then processes the file into it.
  def load
    ensure_dest(@dest)
    process(LibXML::XML::Parser.file(@file), @dest)
  end
end
227
+
228
# A feed fetched over HTTP from the urls of a configuration entry.
class Source < SourceBase
  # conf_in - either a configuration (anything answering #keys) or the
  #           name of an entry in the configuration file. Exits through
  #           die when the named entry does not exist.
  def initialize(conf_in)
    if conf_in.respond_to?(:keys)
      @conf = conf_in
    else
      @conf = get_config(conf_in) || die("No config for #{conf_in}")
    end
  end

  # Fetches every configured url and processes each response into the
  # "dest" directory. "user"/"password", when configured, are passed to
  # the Fetcher.
  def load
    conf = @conf
    dest = conf["dest"] || die("No 'dest' directory defined")
    urls = conf["urls"] || die("No urls defined")

    ensure_dest(dest)

    fetcher = Fetcher.new
    fetcher.user = conf["user"]
    fetcher.password = conf["password"]

    urls.each do |urlpath|
      url = URI(urlpath)
      fetcher.get(url) do |response|
        write_raw(response.body, "#{dest}/raw.xml") if Debug
        parser = LibXML::XML::Parser.string(response.body)
        process(parser, dest)
      end
    end
  end

  # Returns the whole configuration as a Hash; empty when the file does
  # not exist yet (first use of `load -n`) or contains no document.
  def all_configs
    return {} unless File.exist?(CONFIG_FILE)

    YAML.load_file(CONFIG_FILE) || {}
  end

  # Returns the configuration entry for key, or nil.
  def get_config(key)
    all_configs[key]
  end

  # Adds conf under key and rewrites the configuration file. Exits
  # through die when the key is already present.
  def save_config(key, conf)
    configuration_data = all_configs
    if configuration_data.has_key? key
      die("Already have config for #{key}")
    else
      configuration_data[key] = conf
      # File.open rather than Kernel#open, which gives special meaning
      # to names starting with "|".
      File.open(CONFIG_FILE, 'w') do |conf_file|
        conf_file.puts(configuration_data.to_yaml)
      end
    end
  end

  # Writes data to filename, replacing any previous content.
  def write_raw(data, filename)
    File.open(filename, "w") { |f| f.write(data) }
  end
end
283
+
284
# Reports a fatal error on standard error, where the other diagnostics of
# this file already go, and terminates the process with exit status 1.
def die(error)
  $stderr.puts error
  exit 1
end
288
+
289
# Wrap an HTTP session, handle making a request and
# following any redirects
class Fetcher
  def initialize
    @host = nil
    @endpoint = nil
    @session = nil
    @user = nil
    @password = nil
  end
  attr_accessor :user, :password

  # Requests url and yields the response when it is a 200.
  #
  # url       - a URI
  # max_depth - how many redirects may still be followed
  #
  # A redirect response carrying a Location header is followed; the
  # Location is resolved against url, so relative targets work. Any other
  # response is reported on standard output and nothing is yielded.
  #
  # NOTE(review): when a user is set, the credentials are also sent with
  # redirected requests, whatever host they go to.
  def get(url, max_depth=3, &block)
    # The session depends on scheme and port as well as host; otherwise a
    # redirect from http to https on the same host would reuse the plain
    # session.
    endpoint = [url.scheme, url.host, url.port]
    if endpoint != @endpoint
      @endpoint = endpoint
      @host = url.host
      @session = Net::HTTP.new(url.host, url.port)
      @session.use_ssl = (url.scheme == 'https')
    end
    request = Net::HTTP::Get.new(url)
    request.basic_auth(user, password) if user

    msg = "Reading (#{url.to_s})"
    msg = "#{msg} as #{user}" if user
    $stderr.puts(msg)

    feedxml = @session.request(request)
    location = feedxml['Location']
    if feedxml.is_a?(Net::HTTPOK)
      yield feedxml
    elsif feedxml.is_a?(Net::HTTPRedirection) && location
      new_url = URI.join(url.to_s, location)
      if new_url.to_s == url.to_s
        puts("Confused! redirect to same url #{new_url}")
      elsif max_depth == 0
        puts("Too many redirects")
      else
        puts("Redirecting to #{new_url}")
        get(new_url, max_depth-1, &block)
      end
    else
      puts("GET returned #{feedxml.code}")
      puts(feedxml.body)
    end
  end
end
334
+
335
# Creates the directory dest when it is not a directory yet, and prints
# a message on standard error if it still is not one afterwards.
def ensure_dest(dest)
  Dir.mkdir(dest) if !File.directory?(dest)
  return if File.directory?(dest)

  $stderr.puts "Could not create directory #{dest}"
end
341
+
342
# Command line entry point: choose the source from the arguments, then
# load it.
#   -f <filename> <destination>   read a feed file
#   -n <url> ... <destination>    fetch the urls and store them as a
#                                 config entry named after the destination
#   <config>                      fetch an existing config entry
feed_source =
  case ARGV[0]
  when nil
    $stderr.puts "Syntax:\nabelard load -f <filename> <destination>\nabelard load {-n <url>} ... <destination>\nabelard load <config>"
    exit(1)
  when "-f"
    FileSource.new(ARGV[1], ARGV[2])
  when '-n'
    # Consume every leading "-n <url>" pair; what follows is the
    # destination.
    collected = []
    until ARGV[0] != '-n'
      ARGV.shift
      collected << ARGV.shift
    end
    settings = {"urls" => collected, "dest" => ARGV[0]}
    fetched = Source.new(settings)
    fetched.save_config(settings["dest"], settings)
    fetched
  else
    Source.new(ARGV[0])
  end

feed_source.load
@@ -0,0 +1,80 @@
1
require 'digest'
require 'uri'
2
+
3
# Small lookups on the direct children of an XML node.
module XmlUtil
  # Returns the content of the first child named elementname, or nil.
  def self.child_content(node, elementname)
    el = node.children.find { |n| n.name == elementname }
    el && el.content
  end

  # Returns the value of attribute attributename on the first child named
  # elementname, or nil when the child or the attribute is missing.
  def self.child_attribute(node, elementname, attributename)
    el = node.children.find { |n| n.name == elementname }
    # Look up the requested attribute; "term" used to be hard-coded here.
    attr = el && el.attributes.get_attribute(attributename)
    attr && attr.value
  end

  # Returns true/false depending on whether the node's attribute has the
  # given value, or nil when the node does not have the attribute.
  def self.with_attribute(node, attributename, attributevalue)
    a = node.attributes.get_attribute(attributename)
    a && (a.value == attributevalue)
  end

  # Returns the href of the first child <link rel="self">, or nil when
  # there is no such child or it has no href.
  def self.self_link(node)
    el = node.children.find { |l| (l.name == "link") && with_attribute(l, "rel", "self") }
    href = el && el.attributes.get_attribute("href")
    href && href.value
  end
end
22
+
23
+
24
# Derives the archive file name of an RSS item from its <guid>.
class Post_id_rss
  attr_reader :idurl

  # postxml - the item node; the content of its guid child is kept as
  #           idurl and its serialised form as a fallback identity.
  def initialize(postxml)
    #XmlUtil::child_content(postxml, "post_id") ||
    @idurl = XmlUtil::child_content(postxml, "guid")
    @raw = postxml.to_s
  end

  # Returns "comment-<post>-<comment>.xml" when the guid looks like a
  # comment link, "post-<id>.xml" otherwise, and an improvised name when
  # the item has no guid.
  def to_s
    return improvise unless idurl

    commenturl = /\?p(age_id)?=(\d+)(\.xml)?#comment-(.*)$/.match(idurl) ||
                 /^(.*)\/(\d{4}\/.*)\/#(comment)-(.*)$/.match(idurl)

    if commenturl
      postnumber = commenturl[2].sub(/^\//,'').sub(/\.xml$/,'').gsub('/','-')
      commentnumber = commenturl[4]
      "comment-#{postnumber}-#{commentnumber}.xml"
    else
      "post-#{post_match}.xml"
    end
  end

  # Returns the number from a "?p=<n>" / "?page_id=<n>" guid, or the
  # sanitized guid when it has no such query.
  def post_match
    posturl = /\?p(age_id)?=(\d+)(\.xml)?$/.match(idurl)
    if posturl
      posturl[2]
    else
      sanitize
    end
  end

  # Returns the file name for a comment on this post.
  def as_comment(commentnumber)
    "comment-#{post_match}-#{commentnumber}.xml"
  end

  # Turns the guid into a string without "/" for use in a file name.
  #
  # For a "tag:" guid this is the part after its last "-". Otherwise it
  # is the path without leading "/" and trailing ".xml", "/" replaced by
  # "-", followed by the query and fragment when present. A guid that
  # cannot be parsed as a URI is reported on standard error and reduced
  # to letters, digits, ".", "_" and "-".
  def sanitize
    begin
      uri = URI(idurl)
    rescue URI::InvalidURIError
      $stderr.puts("Could not parse url #{idurl}")
      return idurl.gsub(/[^A-Za-z0-9._-]+/, '-')
    end

    if uri.scheme == "tag"
      return idurl.split('-').last
    end

    # A URI without a hierarchical part has no path; fall back to its
    # opaque part.
    locator = uri.path || uri.opaque || ''
    build = locator.sub(/^\//,'').sub(/\.xml$/,'').gsub('/','-')
    build.concat('-' + uri.query.gsub(/[?&]/,'-')) if uri.query
    build.concat('-' + uri.fragment) if uri.fragment
    build
  end

  # Returns a name derived from the item's serialised XML. A digest is
  # used because String#hash differs between Ruby processes, which would
  # give the same item a different file name on every run.
  def improvise
    "post-#{Digest::SHA1.hexdigest(@raw)[0, 16]}.xml"
  end
end
80
+
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env ruby
2
+ require 'sinatra/base'
3
+ require 'yaml'
4
+ require 'abelard/dir.rb'
5
+ require 'abelard/archive.rb'
6
+
7
CONFIG_FILE="blogfeeds.yaml"

# Web front end: an index page of the configured feeds and, per feed,
# the reassembled feed document.
class FeedServer < Sinatra::Base
  archive = Archive.new(CONFIG_FILE)
  # NOTE(review): this listens on every network interface - confirm that
  # is intended.
  set :bind, "0.0.0.0"

  helpers do
    # HTML-escapes text before it is placed in a page. Feed titles come
    # from the archived feed documents, so they are not trusted markup.
    def h(text)
      Rack::Utils.escape_html(text.to_s)
    end
  end

  get '/' do
    template = <<ERB
<html><body><h2>Feeds</h2><dl>
<% archive.available.each do |blog| %>
<dt><%= h(blog) %></dt>
<dd><%= h(archive.dir(blog).info["title"]) %></dd>
<dd><a href="/<%= h(blog) %>/posts">posts</a></dd>
<% end %>
</dl></body></html>
ERB
    erb template, :locals => { :archive => archive }
  end

  get '/*/posts' do |blog|
    # Answer 404 for a feed that is not configured instead of failing
    # with an internal error.
    halt 404, "Unknown feed #{h(blog)}" unless archive.available.include?(blog)

    headers "Content-Type" => "application/atom+xml"
    archive.dir(blog).posts_feed.to_s
  end

  run!
end
35
+
metadata ADDED
@@ -0,0 +1,67 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: abelard
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Anomaly UK
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-08-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rugged
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.23'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.23'
27
+ description: Persist blogs and similar web content as sharable git repositories
28
+ email: anomalyuk@tesco.net
29
+ executables:
30
+ - abelard
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - bin/abelard
35
+ - lib/abelard/archive.rb
36
+ - lib/abelard/dir.rb
37
+ - lib/abelard/dump.rb
38
+ - lib/abelard/history.rb
39
+ - lib/abelard/list.rb
40
+ - lib/abelard/load.rb
41
+ - lib/abelard/postxml.rb
42
+ - lib/abelard/web.rb
43
+ homepage: http://anomalyuk.blogspot.com/
44
+ licenses:
45
+ - GPL-2.0
46
+ metadata: {}
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ requirements: []
62
+ rubyforge_project:
63
+ rubygems_version: 2.5.2
64
+ signing_key:
65
+ specification_version: 4
66
+ summary: Abelard blog archiver
67
+ test_files: []