bunto-import 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,62 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class RSS < Importer
4
# Registers the command-line options understood by the RSS importer.
#
# c - the command object being configured; only its `option` method is used.
def self.specify_options(c)
  c.option('source', '--source NAME', 'The RSS file or URL to import')
end
7
+
8
# Aborts the import when the mandatory 'source' option is absent.
#
# options - the Hash of importer options.
#
# Returns nothing.
def self.validate(options)
  return unless options['source'].nil?

  abort "Missing mandatory option --source."
end
13
+
14
# Loads the libraries the RSS importer relies on, delegating to
# BuntoImport.require_with_fallback with the list of library names.
def self.require_deps
  libraries = ['rss', 'rss/1.0', 'rss/2.0', 'open-uri', 'fileutils', 'safe_yaml']
  BuntoImport.require_with_fallback(libraries)
end
24
+
25
# Process the import.
#
# options - the Hash of importer options:
#           'source' - a URL or a local file path String (required).
#
# Writes one "_posts/YYYY-MM-DD-slug.html" file per feed item, made of a YAML
# header (layout and title) followed by the item's description.
#
# Raises KeyError if 'source' is missing and RuntimeError if the parser
# returns nothing for the source.
#
# Returns nothing.
def self.process(options)
  source = options.fetch('source')

  # Kernel#open is deliberately avoided: it runs sources that start with "|"
  # as shell commands and no longer opens URLs on Ruby 3. Remote sources are
  # read through open-uri's URI#read, everything else as a plain file.
  content =
    if source =~ %r{\A(?:https?|ftp)://}i
      URI.parse(source).read
    else
      File.open(source, 'r') { |file| file.read }
    end
  rss = ::RSS::Parser.parse(content, false)

  raise "There doesn't appear to be any RSS items at the source (#{source}) provided." unless rss

  FileUtils.mkdir_p("_posts")

  rss.items.each do |item|
    formatted_date = item.date.strftime('%Y-%m-%d')
    # Inside the character class "$" is a literal dollar sign; as a bare
    # alternative it would only be an end-of-line anchor.
    post_name = item.title.split(%r{[ !/:&\-$,]+}).reject(&:empty?).map(&:downcase).join('-')
    name = "#{formatted_date}-#{post_name}"

    header = {
      'layout' => 'post',
      'title' => item.title
    }

    File.open("_posts/#{name}.html", "w") do |f|
      f.puts header.to_yaml
      f.puts "---\n\n"
      f.puts item.description
    end
  end
end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,60 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class S9Y < Importer
4
# Registers the command-line options understood by the S9Y importer.
#
# c - the command object being configured; only its `option` method is used.
def self.specify_options(c)
  c.option('source', '--source SOURCE', 'The URL of the S9Y RSS feed')
end
7
+
8
# Aborts the import, with an example feed URL, when the mandatory 'source'
# option is absent.
#
# options - the Hash of importer options.
#
# Returns nothing.
def self.validate(options)
  return unless options['source'].nil?

  abort "Missing mandatory option --source, e.g. --source \"http://blog.example.com/rss.php?version=2.0&all=1\""
end
13
+
14
# Loads the libraries the S9Y importer relies on, delegating to
# BuntoImport.require_with_fallback with the list of library names.
def self.require_deps
  libraries = ['open-uri', 'rss', 'fileutils', 'safe_yaml']
  BuntoImport.require_with_fallback(libraries)
end
22
+
23
# Process the import.
#
# options - the Hash of importer options:
#           'source' - the URL (or local path) of the S9Y RSS feed (required).
#
# Writes one "_posts/YYYY-MM-DD-slug.markdown" file per feed item. The slug
# and permalink are taken from the item's link, which must look like
# ".../archives/<number>-<slug>.html"; a link of any other shape raises
# NoMethodError, as before.
#
# Returns nothing.
def self.process(options)
  source = options.fetch('source')

  FileUtils.mkdir_p("_posts")

  # Kernel#open is deliberately avoided: it runs sources that start with "|"
  # as shell commands and no longer opens URLs on Ruby 3. Remote sources are
  # read through open-uri's URI#read, everything else as a plain file.
  text =
    if source =~ %r{\A(?:https?|ftp)://}i
      URI.parse(source).read
    else
      File.open(source, 'r') { |file| file.read }
    end
  rss = ::RSS::Parser.parse(text)

  rss.items.each do |item|
    post_url = item.link.match('.*(/archives/.*)')[1]
    categories = item.categories.map { |category| category.content }
    content = item.content_encoded.strip
    date = item.date
    slug = item.link.match('.*/archives/[0-9]+-(.*)\.html')[1]
    name = "#{date.strftime('%Y-%m-%d')}-#{slug}.markdown"

    # Empty and missing fields are dropped from the YAML header.
    data = {
      'layout' => 'post',
      'title' => item.title,
      'categories' => categories,
      'permalink' => post_url,
      's9y_link' => item.link,
      'date' => date,
    }.delete_if { |_key, value| value.nil? || value == '' }.to_yaml

    # Write out the data and content to file
    File.open("_posts/#{name}", "w") do |f|
      f.puts data
      f.puts "---"
      f.puts content
    end
  end
end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,70 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class TextPattern < Importer
4
# Reads a MySQL database via Sequel and creates a post file for each post.
# The only posts selected are those with a status of 4 or 5, which means
# "live" and "sticky" respectively.
# Other statuses are 1 => draft, 2 => hidden and 3 => pending.
#
# Built from adjacent literals so that source indentation does not end up
# inside the SQL text, and frozen so the shared constant cannot be mutated.
QUERY = "SELECT Title, url_title, Posted, Body, Keywords " \
        "FROM textpattern " \
        "WHERE Status = '4' OR Status = '5'".freeze
16
+
17
# Loads the libraries the TextPattern importer relies on, delegating to
# BuntoImport.require_with_fallback with the list of library names.
def self.require_deps
  libraries = ['rubygems', 'sequel', 'fileutils', 'safe_yaml']
  BuntoImport.require_with_fallback(libraries)
end
25
+
26
# Registers the database connection options of the TextPattern importer.
#
# c - the command object being configured; only its `option` method is used.
def self.specify_options(c)
  option_table = [
    ['dbname', '--dbname DB', 'Database name'],
    ['user', '--user USER', 'Database user name'],
    ['password', '--password PW', "Database user's password"],
    ['host', '--host HOST', 'Database host name (default: "localhost")']
  ]
  # The value of the final registration is returned, as it was before.
  option_table.map { |key, switch, description| c.option(key, switch, description) }.last
end
32
+
33
# Process the import.
#
# options - the Hash of importer options:
#           'dbname'   - database name (required).
#           'user'     - database user name (required).
#           'password' - database user's password (default: "").
#           'host'     - database host name (default: "localhost").
#
# Writes one "_posts/YYYY-MM-DD-url_title.textile" file for every row
# returned by QUERY.
#
# Raises KeyError if 'dbname' or 'user' is missing.
def self.process(options)
  dbname = options.fetch('dbname')
  user   = options.fetch('user')
  pass   = options.fetch('password', "")
  host   = options.fetch('host', "localhost")

  db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8')

  FileUtils.mkdir_p "_posts"

  db[QUERY].each do |post|
    # Get required fields and construct Bunto compatible name.
    title = post[:Title]
    slug = post[:url_title]
    date = post[:Posted]
    content = post[:Body]

    name = [date.strftime("%Y-%m-%d"), slug].join('-') + ".textile"

    # Keywords may be NULL; the comma-separated list is trimmed and blank
    # entries are discarded.
    tags = post[:Keywords].to_s.split(',').map(&:strip).reject(&:empty?)

    # Get the relevant fields as a hash, delete empty fields (including an
    # empty tag list) and convert to YAML for the header.
    data = {
      'layout' => 'post',
      'title' => title.to_s,
      'tags' => tags
    }.delete_if { |_key, value| value.nil? || value == '' || value == [] }.to_yaml

    # Write out the data and content to file.
    File.open("_posts/#{name}", "w") do |f|
      f.puts data
      f.puts "---"
      f.puts content
    end
  end
end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,289 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class Tumblr < Importer
4
# Loads the libraries the Tumblr importer relies on, delegating to
# BuntoImport.require_with_fallback with the list of library names.
def self.require_deps
  libraries = ['rubygems', 'fileutils', 'open-uri', 'nokogiri', 'json', 'uri', 'time', 'bunto']
  BuntoImport.require_with_fallback(libraries)
end
16
+
17
# Registers the command-line options understood by the Tumblr importer.
#
# c - the command object being configured; only its `option` method is used.
def self.specify_options(c)
  option_table = [
    ['url', '--url URL', 'Tumblr URL'],
    ['format', '--format FORMAT', 'Output format (default: "html")'],
    ['grab_images', '--grab_images', 'Whether to grab images (default: false)'],
    ['add_highlights', '--add_highlights', 'Whether to add highlights (default: false)'],
    ['rewrite_urls', '--rewrite_urls', 'Whether to rewrite URLs (default: false)']
  ]
  # The value of the final registration is returned, as it was before.
  option_table.map { |key, switch, description| c.option(key, switch, description) }.last
end
24
+
25
# Process the import.
#
# options - the Hash of importer options:
#           'url'            - Tumblr URL (required).
#           'format'         - output format (default: "html").
#           'grab_images'    - whether to grab images (default: false).
#           'add_highlights' - whether to add highlights (default: false).
#           'rewrite_urls'   - whether to rewrite URLs (default: false).
#
# Pages through "<url>/api/read/json/" 50 posts at a time and writes every
# post under "_posts/tumblr". Paging stops at the first page holding fewer
# than 50 posts.
#
# Raises KeyError if 'url' is missing and RuntimeError when a response
# contains no JSON object.
def self.process(options)
  url            = options.fetch('url')
  format         = options.fetch('format', "html")
  grab_images    = options.fetch('grab_images', false)
  add_highlights = options.fetch('add_highlights', false)
  rewrite_urls   = options.fetch('rewrite_urls', false)

  @grab_images = grab_images
  FileUtils.mkdir_p "_posts/tumblr"
  url += "/api/read/json/"
  per_page = 50
  posts = []
  current_page = 0

  # Two passes are required so that we can rewrite URLs.
  # First pass builds up an array of each post as a hash.
  loop do
    feed_url = url + "?num=#{per_page}&start=#{current_page * per_page}"
    puts "Fetching #{feed_url}"
    # open-uri's URI#read fetches and closes the connection in one step;
    # Kernel#open no longer accepts URLs on Ruby 3.
    response = URI.parse(feed_url).read
    # Strip Tumblr's JSONP wrapper by keeping everything from the first "{"
    # to the last "}", instead of relying on fixed character offsets.
    json = response[/\{.*\}/m]
    raise "Unexpected response from #{feed_url}" if json.nil?
    blog = JSON.parse(json)
    puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}"
    batch = blog["posts"].map { |post| post_to_hash(post, format) }

    # If we're rewriting, save the posts for later. Otherwise, go ahead and
    # dump these to disk now
    if rewrite_urls
      posts += batch
    else
      batch.each { |post| write_post(post, format == "md", add_highlights) }
    end

    break if blog["posts"].size < per_page
    current_page += 1
  end

  # Rewrite URLs, create redirects and write out posts if necessary
  if rewrite_urls
    posts = rewrite_urls_and_redirects posts
    posts.each { |post| write_post(post, format == "md", add_highlights) }
  end
end
65
+
66
+ private
67
+
68
# Writes a post out to disk as "_posts/tumblr/<name>": the YAML header, a
# "---" separator, then the content. Posts without content are skipped.
#
# post           - the post Hash (:name, :header, :content and :slug are read).
# use_markdown   - when truthy the content goes through html_to_markdown.
# add_highlights - when truthy (and use_markdown is too) the content is also
#                  passed through add_syntax_highlights.
def self.write_post(post, use_markdown, add_highlights)
  body = post[:content]
  return unless body

  body = html_to_markdown(body) if use_markdown

  if use_markdown && add_highlights
    slug_path = URI.parse(post[:slug]).path
    target_dir = slug_path.sub(/\//, "") + "/"
    FileUtils.mkdir_p(target_dir)
    body = add_syntax_highlights(body, target_dir)
  end

  File.open("_posts/tumblr/#{post[:name]}", "w") do |file|
    file.puts post[:header].to_yaml + "---\n" + body
  end
end
88
+
89
# Converts each type of Tumblr post to a hash with all required
# data for Bunto.
#
# post   - the Hash for one post as parsed from the Tumblr JSON feed.
# format - the file extension String used for the post's file name.
#
# Optional fields (captions, descriptions, sources) that are missing are left
# out of the generated markup instead of raising.
#
# Returns a Hash with :name, :header, :content, :url and :slug. :content is
# nil when the post type is unknown or carries no body.
def self.post_to_hash(post, format)
  case post['type']
  when "regular"
    title = post["regular-title"]
    content = post["regular-body"]
  when "link"
    title = post["link-text"] || post["link-url"]
    content = "<a href=\"#{post["link-url"]}\">#{title}</a>"
    content << "<br/>" + post["link-description"] unless post["link-description"].nil?
  when "photo"
    title = post["slug"].to_s.gsub("-", " ")
    photos = post["photos"] || []
    if photos.size > 1
      content = ""
      photos.each do |post_photo|
        content << fetch_photo(post_photo) << "<br/>"
        content << post_photo["caption"].to_s
      end
    else
      content = fetch_photo(post)
    end
    content << "<br/>" + post["photo-caption"] unless post["photo-caption"].nil?
  when "audio"
    if post["id3-title"].nil?
      title = post["audio-caption"]
      content = post["audio-player"]
    else
      title = post["id3-title"]
      content = post["audio-player"].to_s + "<br/>" + post["audio-caption"].to_s
    end
  when "quote"
    title = post["quote-text"]
    content = "<blockquote>#{post["quote-text"]}</blockquote>"
    content << "&#8212;" + post["quote-source"] unless post["quote-source"].nil?
  when "conversation"
    title = post["conversation-title"]
    content = "<section><dialog>"
    post["conversation"].each do |line|
      content << "<dt>#{line['label']}</dt><dd>#{line['phrase']}</dd>"
    end
    # Close the elements in the reverse order they were opened.
    content << "</dialog></section>"
  when "video"
    title = post["video-title"]
    # Built as a new string so the player markup held by `post` is not
    # modified in place.
    parts = [post["video-player"], post["video-caption"]].compact
    content = parts.join("<br/>") unless parts.empty?
  when "answer"
    title = post["question"]
    content = post["answer"]
  end

  date = Date.parse(post['date']).to_s
  title = Nokogiri::HTML(title.to_s).text
  title = "no title" if title.empty?

  # Prefer Tumblr's own slug, then one derived from the title (capped at the
  # same length as before), and finally the post id.
  slug =
    if post["slug"] && post["slug"].strip != ""
      post["slug"]
    elsif title != 'no title' && title.downcase.gsub(/[^a-z0-9\-]/, '') != ''
      title.downcase.strip.gsub(' ', '-').gsub(/[^a-z0-9\-]/, '').slice(0..200)
    else
      post['id']
    end

  {
    :name => "#{date}-#{slug}.#{format}",
    :header => {
      "layout" => "post",
      "title" => title,
      "date" => Time.parse(post['date']).xmlschema,
      "tags" => (post["tags"] || []),
      "tumblr_url" => post["url-with-slug"]
    },
    :content => content,
    :url => post["url"],
    :slug => post["url-with-slug"],
  }
end
175
+
176
# Attempts to fetch the largest version of a photo available for a post.
# If that file fails, it tries the next smaller size until all available
# photo URLs are exhausted. If they all fail, the import is aborted.
#
# post - a Hash holding "photo-url" and/or "photo-url-<size>" entries.
#
# Returns an "<img>" tag String pointing at the value save_photo returned.
def self.fetch_photo(post)
  # Only "photo-url-<size>" keys describe a size; largest first.
  sizes = post.keys.map { |key| key.to_s[/\Aphoto-url-(\d+)\z/, 1] }.compact.map(&:to_i).uniq.sort.reverse
  # An unsized "photo-url" keeps its precedence but is attempted only once.
  candidates = ([post["photo-url"]] + sizes.map { |size| post["photo-url-#{size}"] }).compact.uniq

  # The file extension comes from the first photo URL whose last path segment
  # contains a dot; without one no extension is appended.
  _key, with_extension = post.find do |key, value|
    key.to_s =~ /\Aphoto-url/ && value.to_s.split("/").last.to_s.include?(".")
  end
  ext = with_extension ? "." + with_extension.split(".").last : ""

  candidates.each do |url|
    begin
      return "<img src=\"#{save_photo(url, ext)}\"/>"
    rescue OpenURI::HTTPError
      puts "Failed to grab photo"
    end
  end

  abort "Failed to fetch photo for post #{post['url']}"
end
200
+
201
# Create a Hash of old urls => new urls, for rewriting and
# redirects, and replace urls in each post. Instantiate Bunto
# site/posts to get the correct permalink format.
#
# posts - an Array of post Hashes (:name, :slug and :content are read).
#
# For every post an empty "_posts/tumblr/<name>" file and a redirect page at
# "<tumblr path>/index.html" are written. Post content is modified in place;
# posts without content are left untouched.
#
# Returns the Array of posts.
def self.rewrite_urls_and_redirects(posts)
  site = Bunto::Site.new(Bunto.configuration({}))
  # URI.encode no longer exists on Ruby 3; the RFC 2396 parser provides the
  # same escaping.
  escaper = URI::RFC2396_Parser.new
  urls = Hash[posts.map { |post|
    # Create an initial empty file for the post so that
    # we can instantiate a post object. The block form closes the handle.
    File.open("_posts/tumblr/#{post[:name]}", "w") {}
    tumblr_url = URI.parse(escaper.escape(post[:slug])).path
    bunto_url = Bunto::Post.new(site, Dir.pwd, "", "tumblr/" + post[:name]).url
    redirect_dir = tumblr_url.sub(/\//, "") + "/"
    FileUtils.mkdir_p redirect_dir
    File.open(redirect_dir + "index.html", "w") do |f|
      f.puts "<html><head><link rel=\"canonical\" href=\"" +
             "#{bunto_url}\"><meta http-equiv=\"refresh\" content=\"0; " +
             "url=#{bunto_url}\"></head><body></body></html>"
    end
    [tumblr_url, bunto_url]
  }]
  posts.map { |post|
    unless post[:content].nil?
      urls.each do |tumblr_url, bunto_url|
        # An empty path would match between every character.
        next if tumblr_url.empty?
        # The path is matched literally, and the block form keeps
        # backslashes in the replacement from being interpreted.
        post[:content].gsub!(/#{Regexp.escape(tumblr_url)}/i) { bunto_url }
      end
    end
    post
  }
end
228
+
229
# Convert preserving HTML tables as per the markdown docs.
#
# All markup except table, tr, th and td tags is removed by taking the text
# of the Nokogiri-parsed document. Single quotes are doubled before parsing,
# as the original implementation did.
#
# content - the HTML String; it is not modified, so frozen strings work too.
#
# Returns a new String.
def self.html_to_markdown(content)
  preserve = ["table", "tr", "th", "td"]

  # Hide the preserved tags behind placeholders that survive text extraction.
  shielded = preserve.inject(content) do |text, tag|
    text.gsub(/<#{tag}/i, "$$" + tag).gsub(/<\/#{tag}/i, "||" + tag)
  end

  stripped = Nokogiri::HTML(shielded.gsub("'", "''")).text

  # Turn the placeholders back into real tags.
  preserve.inject(stripped) do |text, tag|
    text.gsub("$$" + tag, "<" + tag).gsub("||" + tag, "</" + tag)
  end
end
243
+
244
# Adds pygments highlight tags to code blocks in posts that use
# markdown format. This doesn't guess the language of the code
# block, so you should modify this to suit your own content.
# For example, my code block only contain Python and JavaScript,
# so I can assume the block is JavaScript if it contains a
# semi-colon.
#
# content      - the post body String.
# redirect_dir - the post's redirect directory, with a trailing slash.
#
# The first line of an indented run is replaced by the opening tag and the
# last line of the run by the closing tag.
#
# When a code block is found and "<redirect_dir>index.html" exists, that file
# is also copied into the parent directory. A missing redirect page is
# skipped instead of raising.
#
# Returns the rewritten content String.
def self.add_syntax_highlights(content, redirect_dir)
  lines = content.split("\n")
  # NOTE(review): the indent pattern is a single space as it appears in this
  # copy of the file; Markdown code blocks use four spaces - confirm against
  # the canonical source.
  block, indent, lang, start = false, /^ /, nil, nil
  redirect_index = redirect_dir + "index.html"
  copied = false
  lines.each_with_index do |line, i|
    if !block && line =~ indent
      block = true
      lang = "python"
      start = i
    elsif block
      lang = "javascript" if line =~ /;$/
      block = line =~ indent && i < lines.size - 1 # Also handle EOF
      if !block
        lines[start] = "{% highlight #{lang} %}"
        lines[i - 1] = "{% endhighlight %}"
      end
      # Once per call is enough; the copy used to be repeated for every line.
      if !copied && File.exist?(redirect_index)
        FileUtils.cp(redirect_index, redirect_dir + "../" + "index.html")
        copied = true
      end
      lines[i] = lines[i].sub(indent, "")
    end
  end
  lines.join("\n")
end
271
+
272
# Downloads a photo into "tumblr_files" when image grabbing is enabled.
#
# url - the photo URL String.
# ext - the file extension String (with leading dot) to append when the
#       local file name does not already end with it.
#
# Returns the local path (prefixed with "/") when @grab_images is set,
# otherwise the url unchanged.
def self.save_photo(url, ext)
  if @grab_images
    path = "tumblr_files/#{url.split('/').last}"
    # A literal suffix check; used as a regex the dot in ext matched any
    # character.
    path += ext unless path.end_with?(ext)
    FileUtils.mkdir_p "tumblr_files"

    # Don't fetch if we've already cached this file
    unless File.size? path
      puts "Fetching photo #{url}"
      # Download first so a failed request leaves no empty file behind, then
      # write in binary mode because the payload is image data.
      data = URI.parse(url).read
      File.open(path, "wb") { |f| f.write(data) }
    end
    url = "/" + path
  end
  url
end
287
+ end
288
+ end
289
+ end