bunto-import 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class RSS < Importer
4
# Declares the command-line options accepted by the RSS importer.
#
# c - the object the options are declared on; it must respond to #option.
#
# Returns the result of the c.option call.
def self.specify_options(c)
  c.option('source', '--source NAME', 'The RSS file or URL to import')
end
7
+
8
# Stops the program when the mandatory 'source' option was not given.
#
# options - Hash of option values keyed by option name.
#
# Returns nil when 'source' is present; otherwise aborts with a message.
def self.validate(options)
  return unless options['source'].nil?

  abort "Missing mandatory option --source."
end
13
+
14
# Hands the list of libraries this importer needs to
# BuntoImport.require_with_fallback.
#
# Returns whatever BuntoImport.require_with_fallback returns.
def self.require_deps
  libraries = ['rss', 'rss/1.0', 'rss/2.0', 'open-uri', 'fileutils', 'safe_yaml']
  BuntoImport.require_with_fallback(libraries)
end
24
+
25
# Process the import.
#
# options - Hash of importer options. 'source' (required) is a URL or a
#           local file path String pointing at the RSS feed.
#
# Writes one "_posts/YYYY-MM-DD-title.html" file per feed item, each made of
# a YAML header (layout, title), a "---" separator and the item description.
# The "_posts" directory is created even when the feed has no items.
#
# Raises KeyError when 'source' is missing and RuntimeError when the parser
# returns nothing for the feed.
#
# Returns nothing.
def self.process(options)
  source = options.fetch('source')

  # Kernel#open stopped fetching URLs in Ruby 3.0 and spawns a subprocess
  # for names starting with "|". Remote feeds therefore go through
  # open-uri's URI.open (Kernel#open on Rubies that predate it) and
  # everything else is read as a plain file.
  content =
    if source.to_s =~ %r{\A[A-Za-z][A-Za-z0-9+.\-]*://}
      fetch = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
      fetch.call(source) { |io| io.read }
    else
      File.read(source)
    end
  rss = ::RSS::Parser.parse(content, false)

  raise "There doesn't appear to be any RSS items at the source (#{source}) provided." unless rss

  FileUtils.mkdir_p("_posts")

  rss.items.each do |item|
    formatted_date = item.date.strftime('%Y-%m-%d')
    # Split on runs of separator characters. "$" sits inside a character
    # class so it is a literal dollar sign, not an end-of-line anchor.
    post_name = item.title.split(%r{[ !/:&\-$,]+}).reject(&:empty?).map(&:downcase).join('-')
    name = "#{formatted_date}-#{post_name}"

    header = {
      'layout' => 'post',
      'title' => item.title
    }

    File.open("_posts/#{name}.html", "w") do |f|
      f.puts header.to_yaml
      f.puts "---\n\n"
      f.puts item.description
    end
  end
end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,60 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class S9Y < Importer
4
# Declares the command-line options accepted by the S9Y importer.
#
# c - the object the options are declared on; it must respond to #option.
#
# Returns the result of the c.option call.
def self.specify_options(c)
  c.option('source', '--source SOURCE', 'The URL of the S9Y RSS feed')
end
7
+
8
# Stops the program when the mandatory 'source' option was not given.
#
# options - Hash of option values keyed by option name.
#
# Returns nil when 'source' is present; otherwise aborts with a message
# that includes an example feed URL.
def self.validate(options)
  return unless options['source'].nil?

  abort 'Missing mandatory option --source, e.g. --source "http://blog.example.com/rss.php?version=2.0&all=1"'
end
13
+
14
# Hands the list of libraries this importer needs to
# BuntoImport.require_with_fallback.
#
# Returns whatever BuntoImport.require_with_fallback returns.
def self.require_deps
  libraries = ['open-uri', 'rss', 'fileutils', 'safe_yaml']
  BuntoImport.require_with_fallback(libraries)
end
22
+
23
# Process the import.
#
# options - Hash of importer options. 'source' (required) is the URL (or a
#           local file path) of the S9Y RSS feed.
#
# Writes one "_posts/YYYY-MM-DD-slug.markdown" file per feed item. The slug
# and permalink are taken from the "/archives/<id>-<slug>.html" part of the
# item link, so an item whose link lacks that part raises NoMethodError.
#
# Raises KeyError when 'source' is missing.
#
# Returns nothing.
def self.process(options)
  source = options.fetch('source')

  FileUtils.mkdir_p("_posts")

  # Kernel#open stopped fetching URLs in Ruby 3.0 and spawns a subprocess
  # for names starting with "|". Remote feeds therefore go through
  # open-uri's URI.open (Kernel#open on Rubies that predate it) and
  # everything else is read as a plain file.
  text =
    if source.to_s =~ %r{\A[A-Za-z][A-Za-z0-9+.\-]*://}
      fetch = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
      fetch.call(source) { |io| io.read }
    else
      File.read(source)
    end
  rss = ::RSS::Parser.parse(text)

  rss.items.each do |item|
    post_url = item.link.match('.*(/archives/.*)')[1]
    categories = item.categories.map(&:content)
    content = item.content_encoded.strip
    date = item.date
    slug = item.link.match('.*/archives/[0-9]+-(.*)\.html')[1]
    name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month, date.day,
                                           slug]

    # Empty and nil values are dropped so they do not appear in the header.
    data = {
      'layout'     => 'post',
      'title'      => item.title,
      'categories' => categories,
      'permalink'  => post_url,
      's9y_link'   => item.link,
      'date'       => item.date,
    }.delete_if { |_key, value| value.nil? || value == '' }.to_yaml

    # Write out the data and content to file
    File.open("_posts/#{name}", "w") do |f|
      f.puts data
      f.puts "---"
      f.puts content
    end
  end
end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,70 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class TextPattern < Importer
4
+ # Reads a MySQL database via Sequel and creates a post file for each post.
5
+ # The only posts selected are those with a status of 4 or 5, which means
6
+ # "live" and "sticky" respectively.
7
+ # Other statuses are 1 => draft, 2 => hidden and 3 => pending.
8
# Built from adjacent literals so that no source indentation ends up inside
# the SQL text, and frozen so the shared constant cannot be mutated.
QUERY = ("SELECT Title, " \
         "url_title, " \
         "Posted, " \
         "Body, " \
         "Keywords " \
         "FROM textpattern " \
         "WHERE Status = '4' OR " \
         "Status = '5'").freeze
16
+
17
# Hands the list of libraries this importer needs to
# BuntoImport.require_with_fallback.
#
# Returns whatever BuntoImport.require_with_fallback returns.
def self.require_deps
  libraries = ['rubygems', 'sequel', 'fileutils', 'safe_yaml']
  BuntoImport.require_with_fallback(libraries)
end
25
+
26
# Declares the database connection options accepted by the TextPattern
# importer.
#
# c - the object the options are declared on; it must respond to #option.
#
# Returns the result of the last c.option call.
def self.specify_options(c)
  c.option('dbname', '--dbname DB', 'Database name')
  c.option('user', '--user USER', 'Database user name')
  c.option('password', '--password PW', "Database user's password")
  c.option('host', '--host HOST', 'Database host name (default: "localhost")')
end
32
+
33
# Process the import.
#
# options - Hash of importer options: 'dbname' and 'user' are required,
#           'password' defaults to "" and 'host' to "localhost".
#
# Runs QUERY against the MySQL database and writes one
# "_posts/YYYY-MM-DD-url_title.textile" file per returned row.
#
# Raises KeyError when 'dbname' or 'user' is missing.
#
# Returns nothing.
def self.process(options)
  dbname = options.fetch('dbname')
  user   = options.fetch('user')
  pass   = options.fetch('password', "")
  host   = options.fetch('host', "localhost")

  db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8')

  FileUtils.mkdir_p "_posts"

  db[QUERY].each do |post|
    # Get required fields and construct Bunto compatible name.
    title = post[:Title]
    slug = post[:url_title]
    date = post[:Posted]
    content = post[:Body]

    name = [date.strftime("%Y-%m-%d"), slug].join('-') + ".textile"

    # Get the relevant fields as a hash, delete empty fields and convert
    # to YAML for the header. Keywords may be NULL in the database, so it
    # is coerced to a String before being split into tags.
    data = {
      'layout' => 'post',
      'title'  => title.to_s,
      'tags'   => post[:Keywords].to_s.split(',')
    }.delete_if { |_key, value| value.nil? || value == '' }.to_yaml

    # Write out the data and content to file.
    File.open("_posts/#{name}", "w") do |f|
      f.puts data
      f.puts "---"
      f.puts content
    end
  end
end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,289 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class Tumblr < Importer
4
# Hands the list of libraries this importer needs to
# BuntoImport.require_with_fallback.
#
# Returns whatever BuntoImport.require_with_fallback returns.
def self.require_deps
  libraries = ['rubygems', 'fileutils', 'open-uri', 'nokogiri', 'json', 'uri', 'time', 'bunto']
  BuntoImport.require_with_fallback(libraries)
end
16
+
17
# Declares the command-line options accepted by the Tumblr importer.
#
# c - the object the options are declared on; it must respond to #option.
#
# Returns the result of the last c.option call.
def self.specify_options(c)
  c.option('url', '--url URL', 'Tumblr URL')
  c.option('format', '--format FORMAT', 'Output format (default: "html")')
  c.option('grab_images', '--grab_images', 'Whether to grab images (default: false)')
  c.option('add_highlights', '--add_highlights', 'Whether to add highlights (default: false)')
  c.option('rewrite_urls', '--rewrite_urls', 'Whether to rewrite URLs (default: false)')
end
24
+
25
# Process the import.
#
# options - Hash of importer options: 'url' (required) is the blog URL;
#           'format' defaults to "html"; 'grab_images', 'add_highlights'
#           and 'rewrite_urls' default to false.
#
# Pages through the blog's "/api/read/json/" feed 50 posts at a time and
# writes each post below "_posts/tumblr". Markdown conversion is applied
# when the format is "md".
#
# Raises KeyError when 'url' is missing and RuntimeError when a feed page
# contains no JSON object.
#
# Returns nothing meaningful.
def self.process(options)
  url            = options.fetch('url')
  format         = options.fetch('format', "html")
  grab_images    = options.fetch('grab_images', false)
  add_highlights = options.fetch('add_highlights', false)
  rewrite_urls   = options.fetch('rewrite_urls', false)

  @grab_images = grab_images
  FileUtils.mkdir_p "_posts/tumblr"
  # A trailing slash on the blog URL would otherwise produce "//api/...".
  api_url = url.chomp("/") + "/api/read/json/"
  # Kernel#open no longer fetches URLs on Ruby >= 3.0; prefer open-uri's
  # URI.open and fall back to Kernel#open on Rubies that predate it.
  fetch = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
  per_page = 50
  posts = []
  current_page = 0

  # Two passes are required so that we can rewrite URLs.
  # First pass builds up an array of each post as a hash.
  loop do
    feed_url = "#{api_url}?num=#{per_page}&start=#{current_page * per_page}"
    puts "Fetching #{feed_url}"
    # The block form closes the connection once the body has been read.
    body = fetch.call(feed_url) { |feed| feed.read }
    # Strip Tumblr's JSONP wrapper by keeping only the outermost {...}.
    json = body[/\{.*\}/m]
    raise "No JSON found in the response from #{feed_url}" unless json
    blog = JSON.parse(json)
    puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}"
    batch = blog["posts"].map { |post| post_to_hash(post, format) }

    # If we're rewriting, save the posts for later. Otherwise, go ahead and
    # dump these to disk now
    if rewrite_urls
      posts.concat(batch)
    else
      batch.each { |post| write_post(post, format == "md", add_highlights) }
    end

    # A short page means the feed is exhausted.
    break if blog["posts"].size < per_page
    current_page += 1
  end

  # Rewrite URLs, create redirects and write out posts if necessary
  if rewrite_urls
    posts = rewrite_urls_and_redirects posts
    posts.each { |post| write_post(post, format == "md", add_highlights) }
  end
end
65
+
66
+ private
67
+
68
# Writes a single post to "_posts/tumblr/<name>".
#
# post           - Hash with :name, :header, :content and :slug entries.
# use_markdown   - when truthy the content goes through html_to_markdown.
# add_highlights - when truthy (and use_markdown is set) the converted
#                  content also goes through add_syntax_highlights, using a
#                  directory derived from the path of post[:slug].
#
# Posts without content are skipped.
#
# Returns nil.
def self.write_post(post, use_markdown, add_highlights)
  body = post[:content]
  return unless body

  if use_markdown
    body = html_to_markdown(body)
    if add_highlights
      slug_path = URI.parse(post[:slug]).path
      highlight_dir = slug_path.sub(%r{/}, "") + "/"
      FileUtils.mkdir_p(highlight_dir)
      body = add_syntax_highlights(body, highlight_dir)
    end
  end

  target = "_posts/tumblr/#{post[:name]}"
  File.open(target, "w") do |file|
    file.puts(post[:header].to_yaml + "---\n" + body)
  end
end
88
+
89
# Converts each type of Tumblr post to a hash with all required
# data for Bunto.
#
# post   - Hash for one post as parsed from the Tumblr JSON feed.
# format - file extension String used for the generated post name.
#
# The strings held by the post hash are never modified; content is built
# into new strings. Missing captions are treated as empty text.
#
# Returns a Hash with :name, :header, :content, :url and :slug entries.
# :content is nil for post types this method does not recognise.
def self.post_to_hash(post, format)
  case post['type']
  when "regular"
    title = post["regular-title"]
    content = post["regular-body"]
  when "link"
    title = post["link-text"] || post["link-url"]
    content = "<a href=\"#{post["link-url"]}\">#{title}</a>"
    unless post["link-description"].nil?
      content = "#{content}<br/>#{post["link-description"]}"
    end
  when "photo"
    title = post["slug"].to_s.gsub("-", " ")
    photos = Array(post["photos"])
    if photos.size > 1
      content = photos.map do |post_photo|
        "#{fetch_photo(post_photo)}<br/>#{post_photo["caption"]}"
      end.join
    else
      content = fetch_photo(post)
    end
    content = "#{content}<br/>#{post["photo-caption"]}"
  when "audio"
    if post["id3-title"].nil?
      title = post["audio-caption"]
      content = post["audio-player"]
    else
      title = post["id3-title"]
      content = "#{post["audio-player"]}<br/>#{post["audio-caption"]}"
    end
  when "quote"
    title = post["quote-text"]
    content = "<blockquote>#{post["quote-text"]}</blockquote>"
    unless post["quote-source"].nil?
      content = "#{content}&#8212;#{post["quote-source"]}"
    end
  when "conversation"
    title = post["conversation-title"]
    exchange = post["conversation"].map do |line|
      "<dt>#{line['label']}</dt><dd>#{line['phrase']}</dd>"
    end.join
    # Close the tags in the reverse of the order they were opened.
    content = "<section><dialog>#{exchange}</dialog></section>"
  when "video"
    title = post["video-title"]
    content = post["video-player"]
    unless post["video-caption"].nil?
      content = content.nil? ? post["video-caption"] : "#{content}<br/>#{post["video-caption"]}"
    end
  when "answer"
    title = post["question"]
    content = post["answer"]
  end

  date = Date.parse(post['date']).to_s
  title = Nokogiri::HTML(title).text
  title = "no title" if title.empty?

  # Prefer Tumblr's own slug, then one derived from the title (capped at
  # 200 characters), and finally the post id.
  slug =
    if post["slug"] && post["slug"].strip != ""
      post["slug"]
    elsif title != 'no title' && title.downcase.gsub(/[^a-z0-9\-]/, '') != ''
      title.downcase.strip.gsub(' ', '-').gsub(/[^a-z0-9\-]/, '')[0, 200]
    else
      post['id']
    end

  {
    :name => "#{date}-#{slug}.#{format}",
    :header => {
      "layout" => "post",
      "title" => title,
      "date" => Time.parse(post['date']).xmlschema,
      "tags" => (post["tags"] || []),
      "tumblr_url" => post["url-with-slug"]
    },
    :content => content,
    :url => post["url"],
    :slug => post["url-with-slug"],
  }
end
175
+
176
# Attempts to fetch the largest version of a photo available for a post.
# If that file fails, it tries the next smaller size until all available
# photo URLs are exhausted. If they all fail, the import is aborted.
#
# post - Hash for a post or photo entry; photo URLs are read from the
#        "photo-url" key and from "photo-url-<size>" keys.
#
# A plain "photo-url" entry is tried first, then each sized URL from the
# largest to the smallest; every URL is attempted once. The file extension
# is taken from the first sized URL whose last path segment contains a dot,
# and is empty when there is none.
#
# Returns an "<img>" tag String pointing at the result of save_photo.
def self.fetch_photo(post)
  sized_keys = post.keys.grep(/^photo-url-/)
  sizes = sized_keys.map { |key| key.sub("photo-url-", "").to_i }.uniq.sort.reverse

  sample = post.values_at(*sized_keys).find do |value|
    value.to_s.split("/").last.to_s =~ /\./
  end
  ext = sample ? ".#{sample.split(".").last}" : ""

  candidates = [post["photo-url"]] + sizes.map { |size| post["photo-url-#{size}"] }
  candidates.compact.uniq.each do |url|
    begin
      return "<img src=\"#{save_photo(url, ext)}\"/>"
    rescue OpenURI::HTTPError
      puts "Failed to grab photo"
    end
  end

  abort "Failed to fetch photo for post #{post['url']}"
end
200
+
201
# Create a Hash of old urls => new urls, for rewriting and
# redirects, and replace urls in each post. Instantiate Bunto
# site/posts to get the correct permalink format.
#
# posts - Array of post Hashes as built by post_to_hash.
#
# For every post an empty file is created under "_posts/tumblr" and an
# "index.html" redirect page is written into a directory named after the
# path of the post's Tumblr URL. Every old path found in a post's content
# is then replaced, case-insensitively, by the new URL. Posts without
# content are left untouched.
#
# Returns the Array of posts, with their :content rewritten in place.
def self.rewrite_urls_and_redirects(posts)
  site = Bunto::Site.new(Bunto.configuration({}))
  urls = Hash[posts.map { |post|
    # Create an initial empty file for the post so that
    # we can instantiate a post object. The empty block makes File.open
    # close the handle straight away.
    File.open("_posts/tumblr/#{post[:name]}", "w") {}
    # URI.encode was removed in Ruby 3.0; the default parser's escape
    # performs the same escaping.
    tumblr_url = URI.parse(URI::DEFAULT_PARSER.escape(post[:slug])).path
    bunto_url = Bunto::Post.new(site, Dir.pwd, "", "tumblr/" + post[:name]).url
    redirect_dir = tumblr_url.sub(/\//, "") + "/"
    FileUtils.mkdir_p redirect_dir
    File.open(redirect_dir + "index.html", "w") do |f|
      f.puts "<html><head><link rel=\"canonical\" href=\"" +
             "#{bunto_url}\"><meta http-equiv=\"refresh\" content=\"0; " +
             "url=#{bunto_url}\"></head><body></body></html>"
    end
    [tumblr_url, bunto_url]
  }]
  posts.map { |post|
    unless post[:content].nil?
      urls.each do |tumblr_url, bunto_url|
        # An empty path would match at every position of the content.
        next if tumblr_url.to_s.empty?
        # Escape the path so "." and friends match literally; the block
        # form keeps backslashes in the new URL from acting as references.
        post[:content].gsub!(/#{Regexp.escape(tumblr_url)}/i) { bunto_url }
      end
    end
    post
  }
end
228
+
229
# Convert preserving HTML tables as per the markdown docs.
#
# content - HTML String; it is not modified, so frozen strings are accepted.
#
# Table tags (table, tr, th, td) are swapped for "$$tag" / "||tag" markers
# before the markup is reduced to its text through Nokogiri, then restored.
#
# NOTE(review): apostrophes are doubled before parsing and never restored,
# so the result contains '' wherever the input had ' - confirm this is
# intended.
#
# Returns the converted String.
def self.html_to_markdown(content)
  preserve = ["table", "tr", "th", "td"]
  # Work on a copy so the caller's string is left alone.
  text = content.dup
  preserve.each do |tag|
    text.gsub!(/<#{tag}/i, "$$" + tag)
    text.gsub!(/<\/#{tag}/i, "||" + tag)
  end
  text = Nokogiri::HTML(text.gsub("'", "''")).text
  preserve.each do |tag|
    text.gsub!("$$" + tag, "<" + tag)
    text.gsub!("||" + tag, "</" + tag)
  end
  text
end
243
+
244
# Adds pygments highlight tags to code blocks in posts that use
# markdown format. This doesn't guess the language of the code
# block, so you should modify this to suit your own content.
# For example, my code block only contain Python and JavaScript,
# so I can assume the block is JavaScript if it contains a
# semi-colon.
#
# content      - markdown String to scan line by line.
# redirect_dir - directory String (with trailing slash) that may hold the
#                post's redirect "index.html".
#
# A block starts at the first line matching the indent pattern and ends at
# the first line that does not match it, or at the last line. The line that
# opened the block is replaced by the highlight tag and the line before the
# closing one by the endhighlight tag. Lines in between lose their indent.
#
# When a block is found and "<redirect_dir>index.html" exists, it is copied
# once into the parent directory. A missing file is skipped instead of
# raising.
#
# NOTE(review): markdown code blocks are normally indented by four spaces;
# confirm the single-space indent pattern below is the intended one.
#
# Returns the rewritten String.
def self.add_syntax_highlights(content, redirect_dir)
  lines = content.split("\n")
  block, indent, lang, start = false, /^ /, nil, nil
  redirect_index = redirect_dir + "index.html"
  redirect_copied = false
  lines.each_with_index do |line, i|
    if !block && line =~ indent
      block = true
      lang = "python"
      start = i
    elsif block
      lang = "javascript" if line =~ /;$/
      block = line =~ indent && i < lines.size - 1 # Also handle EOF
      if !block
        lines[start] = "{% highlight #{lang} %}"
        lines[i - 1] = "{% endhighlight %}"
      end
      # The redirect page only exists when URLs are being rewritten.
      if !redirect_copied && File.exist?(redirect_index)
        FileUtils.cp(redirect_index, redirect_dir + "../" + "index.html")
        redirect_copied = true
      end
      lines[i] = lines[i].sub(indent, "")
    end
  end
  lines.join("\n")
end
271
+
272
# Downloads a photo into "tumblr_files" when image grabbing is enabled.
#
# url - String URL of the photo.
# ext - file extension String (including the dot, or empty) appended to the
#       local file name unless the name already ends with it.
#
# A file that already exists with a non-zero size is not fetched again.
# The photo is downloaded before the local file is opened, so a failed
# download does not leave an empty file behind, and it is written in binary
# mode so image bytes are stored untouched.
#
# Returns the local path prefixed with "/" when @grab_images is set,
# otherwise the url unchanged.
def self.save_photo(url, ext)
  return url unless @grab_images

  path = "tumblr_files/#{url.split('/').last}"
  # Literal suffix comparison; a regex would treat "." as a wildcard.
  path += ext unless path.end_with?(ext.to_s)
  FileUtils.mkdir_p "tumblr_files"

  # Don't fetch if we've already cached this file
  unless File.size? path
    puts "Fetching photo #{url}"
    # Kernel#open no longer fetches URLs on Ruby >= 3.0; prefer open-uri's
    # URI.open and fall back to Kernel#open on Rubies that predate it.
    fetch = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
    data = fetch.call(url) { |io| io.read }
    File.open(path, "wb") { |f| f.write(data) }
  end
  "/" + path
end
287
+ end
288
+ end
289
+ end