bunto-import 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,88 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class Typo < Importer
4
# Query returning one row per Typo content record together with the name of
# its text filter ('html' when the record has no filter).
#
# Every column alias is introduced with an explicit AS so the statement is
# accepted by both MySQL and PostgreSQL: PostgreSQL rejects a bare alias that
# collides with one of its keywords (FILTER is one of them).
SQL = <<-EOS
  SELECT c.id AS id,
         c.title AS title,
         c.permalink AS slug,
         c.body AS body,
         c.extended AS extended,
         c.published_at AS date,
         c.state AS state,
         c.keywords AS keywords,
         COALESCE(tf.name, 'html') AS filter
  FROM contents c
       LEFT OUTER JOIN text_filters tf
                    ON c.text_filter_id = tf.id
EOS
19
+
20
# Asks BuntoImport to load the libraries the Typo importer relies on.
def self.require_deps
  libraries = %w[rubygems sequel fileutils safe_yaml]
  BuntoImport.require_with_fallback(libraries)
end
28
+
29
# Registers the command-line options understood by the Typo importer.
#
# c:: The command object; only its +option+ method is used here.
def self.specify_options(c)
  {
    'server'   => ['--server TYPE', 'Server type ("mysql" or "postgres")'],
    'dbname'   => ['--dbname DB', 'Database name'],
    'user'     => ['--user USER', 'Database user name'],
    'password' => ['--password PW', "Database user's password (default: '')"],
    'host'     => ['--host HOST', 'Database host name']
  }.each do |key, (switch, description)|
    c.option key, switch, description
  end
end
36
+
37
# Imports every published Typo post into the local "_posts" directory.
#
# options:: Hash with String keys:
#           'server'   - "mysql" or "postgres" (required)
#           'dbname'   - database name (required)
#           'user'     - database user name (required)
#           'password' - database password. Default: ''
#           'host'     - database host. Default: "localhost"
#
# Raises KeyError when a required option is missing and RuntimeError when
# the server type is not recognised.
def self.process(options)
  server = options.fetch('server')
  dbname = options.fetch('dbname')
  user   = options.fetch('user')
  pass   = options.fetch('password', '')
  host   = options.fetch('host', "localhost")

  # Connect (and reject unknown server types) before touching the disk.
  connection = { :user => user, :password => pass, :host => host, :encoding => 'utf8' }
  db = case server.to_s
       when 'postgres' then Sequel.postgres(dbname, connection)
       when 'mysql'    then Sequel.mysql(dbname, connection)
       else raise "Unknown database server '#{server}'"
       end

  FileUtils.mkdir_p '_posts'

  db[SQL].each do |post|
    next unless post[:state] =~ /published/i

    slug = post[:slug] || "no slug"

    # Work on a copy: a NULL body becomes a frozen "" via nil.to_s, and the
    # fetched row should not be modified in place.
    body = post[:body].to_s.dup
    body << "\n<!-- more -->\n" << post[:extended] if post[:extended]

    # Same fallback as the WordPress importer for records without a date.
    date = post[:date] || Time.now

    # The field can list more than one text filter; only the first one is
    # used as the file extension. COALESCE in the query only covers NULL,
    # so a blank value still needs the 'html' default here.
    filter = post[:filter].to_s.split(' ').first || 'html'

    name = format('%04d-%02d-%02d-%s.%s',
                  date.year, date.month, date.day, slug.strip, filter)

    front_matter = {
      'layout'  => 'post',
      'title'   => post[:title] && post[:title].to_s.dup.force_encoding('UTF-8'),
      'tags'    => post[:keywords] && post[:keywords].to_s.dup.force_encoding('UTF-8'),
      'typo_id' => post[:id]
    }.delete_if { |_key, value| value.nil? || value == '' }

    File.open("_posts/#{name}", 'w') do |f|
      f.puts front_matter.to_yaml
      f.puts '---'
      f.puts body.delete("\r")
    end
  end
end
85
+
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,372 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class WordPress < Importer
4
+
5
# Asks BuntoImport to load the libraries the WordPress importer relies on.
def self.require_deps
  libraries = %w[rubygems sequel fileutils safe_yaml unidecode]
  BuntoImport.require_with_fallback(libraries)
end
14
+
15
# Registers the command-line options understood by the WordPress importer.
#
# c:: The command object; only its +option+ method is used here.
def self.specify_options(c)
  c.option 'dbname', '--dbname DB', 'Database name (default: "")'
  c.option 'socket', '--socket SOCKET', 'Database socket (default: "")'
  c.option 'user', '--user USER', 'Database user name (default: "")'
  # The inner quotes must be escaped: unescaped they end the literal and the
  # help text would read "(default: )".
  c.option 'password', '--password PW', "Database user's password (default: \"\")"
  c.option 'host', '--host HOST', 'Database host name (default: "localhost")'
  c.option 'table_prefix', '--table_prefix PREFIX', 'Table prefix name (default: "wp_")'
  c.option 'site_prefix', '--site_prefix PREFIX', 'Site prefix name (default: "")'
  c.option 'clean_entities', '--clean_entities', 'Whether to clean entities (default: true)'
  c.option 'comments', '--comments', 'Whether to import comments (default: true)'
  c.option 'categories', '--categories', 'Whether to import categories (default: true)'
  c.option 'tags', '--tags', 'Whether to import tags (default: true)'
  c.option 'more_excerpt', '--more_excerpt', 'Whether to use more excerpt (default: true)'
  c.option 'more_anchor', '--more_anchor', 'Whether to use more anchor (default: true)'
  c.option 'status', '--status STATUS,STATUS2', Array, 'Array of allowed statuses (default: ["publish"], other options: "draft", "private", "revision")'
end
31
+
32
# Main migrator function. Call this to perform the migration.
#
# opts:: A hash of configuration options keyed by String.
#
# Supported options are:
#
# 'dbname'::         The name of the database. Default: ''
# 'user'::           The database user name. Default: ''
# 'password'::       The database user's password. Default: ''
# 'host'::           The address of the MySQL database host.
#                    Default: 'localhost'
# 'socket'::         The database socket's path. Default: nil
# 'table_prefix'::   Prefix of database tables used by WordPress.
#                    Default: 'wp_'
# 'site_prefix'::    Prefix of database tables used by WordPress
#                    Multisite, eg: 2_.
#                    Default: nil (no prefix)
# 'clean_entities':: If true, convert non-ASCII characters to HTML
#                    entities in the posts, comments, titles, and
#                    names. Requires the 'htmlentities' gem to
#                    work. Default: true.
# 'comments'::       If true, migrate post comments too. Comments
#                    are saved in the post's YAML front matter.
#                    Default: true.
# 'categories'::     If true, save the post's categories in its
#                    YAML front matter. Default: true.
# 'tags'::           If true, save the post's tags in its
#                    YAML front matter. Default: true.
# 'more_excerpt'::   If true, when a post has no excerpt but
#                    does have a <!-- more --> tag, use the
#                    preceding post content as the excerpt.
#                    Default: true.
# 'more_anchor'::    If true, convert a <!-- more --> tag into
#                    two HTML anchors with ids "more" and
#                    "more-NNN" (where NNN is the post number).
#                    Default: true.
# 'extension'::      Set the post extension. Default: "html"
# 'status'::         Array of allowed post statuses. Only
#                    posts with matching status will be migrated.
#                    Known statuses are "publish", "draft", "private",
#                    and "revision". If this is nil or an empty
#                    array, all posts are migrated regardless of
#                    status. Default: ["publish"].
#
def self.process(opts)
  options = {
    :user           => opts.fetch('user', ''),
    :pass           => opts.fetch('password', ''),
    :host           => opts.fetch('host', 'localhost'),
    :socket         => opts.fetch('socket', nil),
    :dbname         => opts.fetch('dbname', ''),
    :table_prefix   => opts.fetch('table_prefix', 'wp_'),
    :site_prefix    => opts.fetch('site_prefix', nil),
    :clean_entities => opts.fetch('clean_entities', true),
    :comments       => opts.fetch('comments', true),
    :categories     => opts.fetch('categories', true),
    :tags           => opts.fetch('tags', true),
    :more_excerpt   => opts.fetch('more_excerpt', true),
    :more_anchor    => opts.fetch('more_anchor', true),
    :extension      => opts.fetch('extension', 'html'),
    # Array() accepts nil (meaning "all statuses") or a single value.
    :status         => Array(opts.fetch('status', ['publish'])).map(&:to_sym)
  }

  if options[:clean_entities]
    begin
      require 'htmlentities'
    rescue LoadError
      STDERR.puts "Could not require 'htmlentities', so the " +
                  ":clean_entities option is now disabled."
      options[:clean_entities] = false
    end
  end

  statuses = options[:status]

  FileUtils.mkdir_p("_posts")
  # An empty status list selects every post, drafts included, so the drafts
  # directory is needed in that case as well.
  FileUtils.mkdir_p("_drafts") if statuses.empty? || statuses.include?(:draft)

  db = Sequel.mysql2(options[:dbname], :user => options[:user], :password => options[:pass],
                     :socket => options[:socket], :host => options[:host], :encoding => 'utf8')

  px = options[:table_prefix]
  sx = options[:site_prefix]

  page_name_query = "
     SELECT
       posts.ID            AS `id`,
       posts.post_title    AS `title`,
       posts.post_name     AS `slug`,
       posts.post_parent   AS `parent`
     FROM #{px}#{sx}posts AS `posts`
     WHERE posts.post_type = 'page'"

  # Maps page id => { :slug, :parent }; used later to build nested page paths.
  page_name_list = {}
  db[page_name_query].each do |page|
    slug = page[:slug]
    slug = sluggify(page[:title]) if !slug || slug.empty?
    page_name_list[page[:id]] = { :slug => slug, :parent => page[:parent] }
  end

  posts_query = "
     SELECT
       posts.ID            AS `id`,
       posts.guid          AS `guid`,
       posts.post_type     AS `type`,
       posts.post_status   AS `status`,
       posts.post_title    AS `title`,
       posts.post_name     AS `slug`,
       posts.post_date     AS `date`,
       posts.post_date_gmt AS `date_gmt`,
       posts.post_content  AS `content`,
       posts.post_excerpt  AS `excerpt`,
       posts.comment_count AS `comment_count`,
       users.display_name  AS `author`,
       users.user_login    AS `author_login`,
       users.user_email    AS `author_email`,
       users.user_url      AS `author_url`
     FROM #{px}#{sx}posts AS `posts`
       LEFT JOIN #{px}users AS `users`
         ON posts.post_author = users.ID"

  unless statuses.empty?
    # Status names come from the command line, so let the database adapter
    # quote them instead of splicing them into the SQL verbatim.
    quoted = statuses.map { |status| db.literal(status.to_s) }.join(", ")
    posts_query = "#{posts_query}
     WHERE posts.post_status IN (#{quoted})"
  end

  db[posts_query].each do |post|
    process_post(post, db, options, page_name_list)
  end
end
169
+
170
+
171
# Writes a single WordPress post or page to disk as YAML front matter
# followed by its content.
#
# post::           Row from the posts query (Symbol keys such as :id, :title,
#                  :slug, :type, :status, :date, :content, :excerpt).
# db::             Database handle; +db[sql]+ must yield rows with Symbol keys.
# options::        Importer options with Symbol keys (:table_prefix,
#                  :site_prefix, :extension, :clean_entities, :comments,
#                  :categories, :tags, :more_excerpt, :more_anchor).
# page_name_list:: Hash of page id => { :slug, :parent } used to build the
#                  directory of a page.
#
# Pages go to "<page path>/index.<extension>", drafts to "_drafts/<slug>.md"
# and everything else to "_posts/<yyyy-mm-dd>-<slug>.<extension>".
def self.process_post(post, db, options, page_name_list)
  px = options[:table_prefix]
  sx = options[:site_prefix]
  extension = options[:extension]
  clean = options[:clean_entities]

  title = post[:title]
  title = clean_entities(title) if clean

  slug = post[:slug]
  slug = sluggify(title.to_s) if slug.nil? || slug.empty?

  date = post[:date] || Time.now
  name = "%04d-%02d-%02d-%s.%s" % [date.year, date.month, date.day,
                                   slug, extension]

  # Work on a copy: NULL content becomes a frozen "" via nil.to_s, and the
  # content is edited in place below.
  content = post[:content].to_s.dup
  content = clean_entities(content) if clean

  excerpt = post[:excerpt].to_s

  more_tag = /<!-- *more *-->/
  more_index = content.index(more_tag)
  more_anchor = nil
  if more_index
    if options[:more_excerpt] && excerpt.empty?
      excerpt = content[0...more_index]
    end
    if options[:more_anchor]
      # Record the anchor id so it is emitted in the front matter.
      more_anchor = "more"
      content.sub!(more_tag,
                   "<a id=\"more\"></a>" +
                   "<a id=\"more-#{post[:id]}\"></a>")
    end
  end

  categories = []
  tags = []

  if options[:categories] || options[:tags]
    terms_query =
      "SELECT
         terms.name AS `name`,
         ttax.taxonomy AS `type`
       FROM
         #{px}#{sx}terms AS `terms`,
         #{px}#{sx}term_relationships AS `trels`,
         #{px}#{sx}term_taxonomy AS `ttax`
       WHERE
         trels.object_id = '#{post[:id]}' AND
         trels.term_taxonomy_id = ttax.term_taxonomy_id AND
         terms.term_id = ttax.term_id"

    db[terms_query].each do |term|
      bucket =
        if options[:categories] && term[:type] == "category"
          categories
        elsif options[:tags] && term[:type] == "post_tag"
          tags
        end
      next unless bucket
      bucket << (clean ? clean_entities(term[:name]) : term[:name])
    end
  end

  comments = []

  if options[:comments] && post[:comment_count].to_i > 0
    comments_query =
      "SELECT
         comment_ID           AS `id`,
         comment_author       AS `author`,
         comment_author_email AS `author_email`,
         comment_author_url   AS `author_url`,
         comment_date         AS `date`,
         comment_date_gmt     AS `date_gmt`,
         comment_content      AS `content`
       FROM #{px}#{sx}comments
       WHERE
         comment_post_ID = '#{post[:id]}' AND
         comment_approved != 'spam'"

    db[comments_query].each do |comment|
      # Copy before re-tagging the encoding; nil.to_s is frozen.
      comcontent = comment[:content].to_s.dup
      comcontent.force_encoding("UTF-8") if comcontent.respond_to?(:force_encoding)
      comcontent = clean_entities(comcontent) if clean

      comauthor = comment[:author].to_s
      comauthor = clean_entities(comauthor) if clean

      comments << {
        'id'           => comment[:id].to_i,
        'author'       => comauthor,
        'author_email' => comment[:author_email].to_s,
        'author_url'   => comment[:author_url].to_s,
        'date'         => comment[:date].to_s,
        'date_gmt'     => comment[:date_gmt].to_s,
        'content'      => comcontent,
      }
    end

    comments = comments.sort_by { |entry| entry['id'] }
  end

  status = post[:status].to_s

  # Get the relevant fields as a hash, delete empty fields and
  # convert to YAML for the header.
  data = {
    'layout'        => post[:type].to_s,
    'status'        => status,
    # Drafts carry no "published" key at all; other statuses get true/false.
    'published'     => status == 'draft' ? nil : (status == 'publish'),
    'title'         => title.to_s,
    'author'        => {
      'display_name'=> post[:author].to_s,
      'login'       => post[:author_login].to_s,
      'email'       => post[:author_email].to_s,
      'url'         => post[:author_url].to_s,
    },
    'author_login'  => post[:author_login].to_s,
    'author_email'  => post[:author_email].to_s,
    'author_url'    => post[:author_url].to_s,
    'excerpt'       => excerpt,
    'more_anchor'   => more_anchor,
    'wordpress_id'  => post[:id],
    'wordpress_url' => post[:guid].to_s,
    'date'          => date.to_s,
    'date_gmt'      => post[:date_gmt].to_s,
    'categories'    => options[:categories] ? categories : nil,
    'tags'          => options[:tags] ? tags : nil,
    'comments'      => options[:comments] ? comments : nil,
  }.delete_if { |_key, value| value.nil? || value == '' }.to_yaml

  filename =
    if post[:type] == 'page'
      page_path(post[:id], page_name_list) + "index.#{extension}"
    elsif post[:status] == 'draft'
      "_drafts/#{slug}.md"
    else
      "_posts/#{name}"
    end

  # Make sure the target directory exists whichever branch was taken.
  FileUtils.mkdir_p(File.dirname(filename))

  # Write out the data and content to file
  File.open(filename, "w") do |f|
    f.puts data
    f.puts "---"
    f.puts Util.wpautop(content)
  end
end
335
+
336
+
337
# Replaces characters that have a named HTML entity with that entity while
# leaving the markup of the text intact.
#
# text:: The text to convert. It is not modified; strings are copied and
#        tagged as UTF-8 before encoding.
#
# Returns the converted String.
def self.clean_entities( text )
  if text.respond_to?(:force_encoding)
    # Copy first so frozen strings are accepted and the caller's string
    # keeps its encoding.
    text = text.dup.force_encoding("UTF-8")
  end
  encoded = HTMLEntities.new.encode(text, :named)
  # We don't want to convert these, it would break all
  # HTML tags in the post and comments. They are restored in a single pass
  # so an entity that was already present in the text (e.g. "&lt;", which is
  # encoded to "&amp;lt;") comes back unchanged instead of being unescaped
  # twice.
  markup = {
    "&amp;"  => "&",
    "&lt;"   => "<",
    "&gt;"   => ">",
    "&quot;" => '"',
    "&apos;" => "'",
    "&#47;"  => "/",
  }
  encoded.gsub(/&(?:amp|lt|gt|quot|apos|#47);/, markup)
end
352
+
353
+
354
# Builds a URL slug from a title: transliterates it to ASCII (via
# String#to_ascii), lower-cases it and joins the alphanumeric runs with "-".
#
# title:: The title to convert; nil is treated as an empty title.
#
# Returns the slug String ("" when the title has no alphanumeric characters).
def self.sluggify( title )
  ascii = title.to_s.to_ascii.downcase
  ascii.gsub(/[^0-9A-Za-z]+/, " ").strip.tr(" ", "-")
end
357
+
358
# Builds the directory path of a page from the slugs of its ancestors.
#
# page_id::        Id of the page.
# page_name_list:: Hash of page id => { :slug, :parent }.
#
# Returns the path with a trailing slash (e.g. "about/team/"), or "" when
# the id is not in the list. The parent chain is followed iteratively and
# stops at the first id seen twice, so a cyclic chain cannot recurse forever.
def self.page_path( page_id, page_name_list )
  segments = []
  visited = {}
  while page_name_list.key?(page_id) && !visited[page_id]
    visited[page_id] = true
    page = page_name_list[page_id]
    segments.unshift("#{page[:slug]}/")
    page_id = page[:parent]
  end
  segments.join("")
end
369
+
370
+ end
371
+ end
372
+ end