bunto-import 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,88 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class Typo < Importer
4
# Fetches every Typo content row together with the name of its text
# filter ('html' when the row has no filter attached).
#
# Every output column is aliased with an explicit AS so the statement
# parses on both MySQL and PostgreSQL: PostgreSQL only accepts a bare
# alias when the name is not one of its keywords, and names such as
# "filter" are keywords there.
SQL = <<-EOS.freeze
  SELECT c.id                      AS id,
         c.title                   AS title,
         c.permalink               AS slug,
         c.body                    AS body,
         c.extended                AS extended,
         c.published_at            AS date,
         c.state                   AS state,
         c.keywords                AS keywords,
         COALESCE(tf.name, 'html') AS filter
  FROM contents c
  LEFT OUTER JOIN text_filters tf
  ON c.text_filter_id = tf.id
EOS
19
+
20
# Loads the libraries the Typo importer relies on by handing their names
# to BuntoImport.require_with_fallback.
def self.require_deps
  dependencies = %w[rubygems sequel fileutils safe_yaml]
  BuntoImport.require_with_fallback(dependencies)
end
28
+
29
# Declares the command-line options accepted by the Typo importer.
#
# c - command object; only its `option` method is called, once per entry
#     below, with the option key, the switch and the help text.
#
# Returns the value of the last `c.option` call.
def self.specify_options(c)
  option_specs = [
    ['server',   '--server TYPE', 'Server type ("mysql" or "postgres")'],
    ['dbname',   '--dbname DB',   'Database name'],
    ['user',     '--user USER',   'Database user name'],
    ['password', '--password PW', "Database user's password (default: '')"],
    ['host',     '--host HOST',   'Database host name']
  ]
  option_specs.map { |spec| c.option(*spec) }.last
end
36
+
37
# Imports every published Typo post into the local "_posts" directory.
#
# options - Hash with String keys:
#           'server'   - "mysql" or "postgres" (required)
#           'dbname'   - database name (required)
#           'user'     - database user name (required)
#           'password' - database password (default: '')
#           'host'     - database host (default: "localhost")
#
# Each row whose state matches /published/i becomes a file named
# "_posts/YYYY-MM-DD-<slug>.<filter>" holding YAML front matter, a "---"
# line and the post body (carriage returns removed). When the row has an
# extended part it is appended after a "<!-- more -->" marker.
#
# Raises KeyError when a required option is missing and RuntimeError when
# 'server' names an unsupported database.
def self.process(options)
  server = options.fetch('server')
  dbname = options.fetch('dbname')
  user   = options.fetch('user')
  pass   = options.fetch('password', '')
  host   = options.fetch('host', 'localhost')

  FileUtils.mkdir_p '_posts'

  connection = { :user => user, :password => pass, :host => host, :encoding => 'utf8' }
  db = case server.intern
       when :postgres then Sequel.postgres(dbname, connection)
       when :mysql    then Sequel.mysql(dbname, connection)
       else raise "Unknown database server '#{server}'"
       end

  # Work on a copy so the strings owned by the database row are never
  # re-tagged in place (which would also fail on frozen strings).
  utf8 = lambda { |value| value && value.to_s.dup.force_encoding('UTF-8') }

  db[SQL].each do |post|
    next unless post[:state] =~ /published/i

    slug = post[:slug].nil? ? 'no slug' : post[:slug]

    # A NULL body must not abort the import, and the row's own string is
    # left untouched: the combined text is built as a new string.
    body = post[:body].to_s
    body = "#{body}\n<!-- more -->\n#{post[:extended]}" if post[:extended]

    # Rows without a publication date fall back to the current time.
    date = post[:date] || Time.now

    # The field can list more than one text filter; only the first one is
    # used as the file extension. COALESCE in the query covers NULL only,
    # so an empty or blank value falls back to 'html' here.
    extension = post[:filter].to_s.split(' ').first || 'html'

    name = format('%04d-%02d-%02d-%s.%s',
                  date.year, date.month, date.day, slug.strip, extension)

    front_matter = {
      'layout'  => 'post',
      'title'   => utf8.call(post[:title]),
      'tags'    => utf8.call(post[:keywords]),
      'typo_id' => post[:id]
    }.delete_if { |_key, value| value.nil? || value == '' }

    File.open("_posts/#{name}", 'w') do |f|
      f.puts front_matter.to_yaml
      f.puts '---'
      f.puts body.delete("\r")
    end
  end
end
85
+
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,372 @@
1
+ module BuntoImport
2
+ module Importers
3
+ class WordPress < Importer
4
+
5
# Loads the libraries the WordPress importer relies on by handing their
# names to BuntoImport.require_with_fallback.
def self.require_deps
  dependencies = %w[rubygems sequel fileutils safe_yaml unidecode]
  BuntoImport.require_with_fallback(dependencies)
end
14
+
15
# Declares the command-line options accepted by the WordPress importer.
#
# c - command object; only its `option` method is called, once per option,
#     with the option key, the switch, an optional type and the help text.
#
# Returns the value of the last `c.option` call.
def self.specify_options(c)
  c.option 'dbname', '--dbname DB', 'Database name (default: "")'
  c.option 'socket', '--socket SOCKET', 'Database socket (default: "")'
  c.option 'user', '--user USER', 'Database user name (default: "")'
  # The inner quotes are escaped: written unescaped inside a double-quoted
  # literal they end the string early and the help text loses its "".
  c.option 'password', '--password PW', "Database user's password (default: \"\")"
  c.option 'host', '--host HOST', 'Database host name (default: "localhost")'
  c.option 'table_prefix', '--table_prefix PREFIX', 'Table prefix name (default: "wp_")'
  c.option 'site_prefix', '--site_prefix PREFIX', 'Site prefix name (default: "")'
  c.option 'clean_entities', '--clean_entities', 'Whether to clean entities (default: true)'
  c.option 'comments', '--comments', 'Whether to import comments (default: true)'
  c.option 'categories', '--categories', 'Whether to import categories (default: true)'
  c.option 'tags', '--tags', 'Whether to import tags (default: true)'
  c.option 'more_excerpt', '--more_excerpt', 'Whether to use more excerpt (default: true)'
  c.option 'more_anchor', '--more_anchor', 'Whether to use more anchor (default: true)'
  c.option 'status', '--status STATUS,STATUS2', Array, 'Array of allowed statuses (default: ["publish"], other options: "draft", "private", "revision")'
end
31
+
32
# Main migrator function. Call this to perform the migration.
#
# opts - Hash with String keys. Supported keys:
#
# 'dbname'::         The name of the database. Default: ''
# 'user'::           The database user name. Default: ''
# 'password'::       The database user's password. Default: ''
# 'host'::           The address of the MySQL database host.
#                    Default: 'localhost'
# 'socket'::         The database socket's path. Default: nil
# 'table_prefix'::   Prefix of database tables used by WordPress.
#                    Default: 'wp_'
# 'site_prefix'::    Prefix of database tables used by WordPress
#                    Multisite, eg: 2_. Default: nil (no prefix)
# 'clean_entities':: If true, convert non-ASCII characters to HTML
#                    entities in the posts, comments, titles, and
#                    names. Requires the 'htmlentities' gem; when it
#                    cannot be loaded a warning is printed to STDERR
#                    and the option is switched off. Default: true.
# 'comments'::       If true, migrate post comments too. Comments
#                    are saved in the post's YAML front matter.
#                    Default: true.
# 'categories'::     If true, save the post's categories in its
#                    YAML front matter. Default: true.
# 'tags'::           If true, save the post's tags in its
#                    YAML front matter. Default: true.
# 'more_excerpt'::   If true, when a post has no excerpt but
#                    does have a <!-- more --> tag, use the
#                    preceding post content as the excerpt.
#                    Default: true.
# 'more_anchor'::    If true, convert a <!-- more --> tag into
#                    two HTML anchors with ids "more" and
#                    "more-NNN" (where NNN is the post number).
#                    Default: true.
# 'extension'::      Set the post extension. Default: "html"
# 'status'::         Array of allowed post statuses. Only
#                    posts with matching status will be migrated.
#                    Known statuses are "publish", "draft", "private",
#                    and "revision". If this is nil or an empty
#                    array, all posts are migrated regardless of
#                    status. Default: ["publish"].
#
# Creates "_posts" (and "_drafts" when drafts are requested) in the
# current directory, then hands every selected row to process_post.
def self.process(opts)
  options = {
    :user           => opts.fetch('user', ''),
    :pass           => opts.fetch('password', ''),
    :host           => opts.fetch('host', 'localhost'),
    :socket         => opts.fetch('socket', nil),
    :dbname         => opts.fetch('dbname', ''),
    :table_prefix   => opts.fetch('table_prefix', 'wp_'),
    :site_prefix    => opts.fetch('site_prefix', nil),
    :clean_entities => opts.fetch('clean_entities', true),
    :comments       => opts.fetch('comments', true),
    :categories     => opts.fetch('categories', true),
    :tags           => opts.fetch('tags', true),
    :more_excerpt   => opts.fetch('more_excerpt', true),
    :more_anchor    => opts.fetch('more_anchor', true),
    :extension      => opts.fetch('extension', 'html'),
    # Array() turns an explicit nil into [] so that "no status filter"
    # works as documented instead of raising on nil.map.
    :status         => Array(opts.fetch('status', ['publish'])).map(&:to_sym) # :draft, :private, :revision
  }

  if options[:clean_entities]
    begin
      # Loaded lazily on purpose: the gem is optional.
      require 'htmlentities'
    rescue LoadError
      STDERR.puts "Could not require 'htmlentities', so the " +
                  ":clean_entities option is now disabled."
      options[:clean_entities] = false
    end
  end

  FileUtils.mkdir_p("_posts")
  FileUtils.mkdir_p("_drafts") if options[:status].include? :draft

  db = Sequel.mysql2(options[:dbname], :user => options[:user], :password => options[:pass],
                     :socket => options[:socket], :host => options[:host], :encoding => 'utf8')

  px = options[:table_prefix]
  sx = options[:site_prefix]

  # Maps page id => { :slug, :parent } so page paths can be rebuilt later.
  page_name_list = {}

  page_name_query = "
    SELECT
      posts.ID            AS `id`,
      posts.post_title    AS `title`,
      posts.post_name     AS `slug`,
      posts.post_parent   AS `parent`
    FROM #{px}#{sx}posts AS `posts`
    WHERE posts.post_type = 'page'"

  db[page_name_query].each do |page|
    slug = page[:slug]
    slug = sluggify(page[:title]) if slug.nil? || slug.empty?
    page_name_list[page[:id]] = { :slug => slug, :parent => page[:parent] }
  end

  posts_query = "
    SELECT
      posts.ID            AS `id`,
      posts.guid          AS `guid`,
      posts.post_type     AS `type`,
      posts.post_status   AS `status`,
      posts.post_title    AS `title`,
      posts.post_name     AS `slug`,
      posts.post_date     AS `date`,
      posts.post_date_gmt AS `date_gmt`,
      posts.post_content  AS `content`,
      posts.post_excerpt  AS `excerpt`,
      posts.comment_count AS `comment_count`,
      users.display_name  AS `author`,
      users.user_login    AS `author_login`,
      users.user_email    AS `author_email`,
      users.user_url      AS `author_url`
    FROM #{px}#{sx}posts AS `posts`
      LEFT JOIN #{px}users AS `users`
        ON posts.post_author = users.ID"

  posts =
    if options[:status].empty?
      db[posts_query]
    else
      # The statuses come from the command line, so they are handed to
      # Sequel as a placeholder argument (quoted and escaped by the
      # library) instead of being spliced into the SQL text. They are
      # converted back to strings because Sequel treats symbols as
      # identifiers, not values.
      db[posts_query + "
    WHERE posts.post_status IN ?", options[:status].map(&:to_s)]
    end

  posts.each do |post|
    process_post(post, db, options, page_name_list)
  end
end
169
+
170
+
171
# Converts one WordPress row into a file: YAML front matter, a "---" line
# and the post content passed through Util.wpautop.
#
# post           - row produced by the posts query in `process` (Symbol
#                  keys such as :id, :title, :slug, :content, :status)
# db             - database handle; `db[sql]` is used to load the post's
#                  terms and comments
# options        - Symbol-keyed options hash built by `process`
# page_name_list - Hash of page id => { :slug, :parent }, used to build
#                  the directory of a page
#
# Pages are written to "<page path>index.<extension>", drafts to
# "_drafts/<slug>.md" and everything else to
# "_posts/YYYY-MM-DD-<slug>.<extension>".
def self.process_post(post, db, options, page_name_list)
  px = options[:table_prefix]
  sx = options[:site_prefix]
  extension = options[:extension]

  # Applies clean_entities only when the option is enabled.
  scrub = lambda { |text| options[:clean_entities] ? clean_entities(text) : text }

  title = scrub.call(post[:title])

  slug = post[:slug]
  slug = sluggify(title) if slug.nil? || slug.empty?

  date = post[:date] || Time.now
  name = "%04d-%02d-%02d-%s.%s" % [date.year, date.month, date.day,
                                   slug, extension]
  content = scrub.call(post[:content].to_s)

  excerpt = post[:excerpt].to_s

  more_tag = /<!-- *more *-->/
  more_index = content.index(more_tag)
  more_anchor = nil
  if more_index
    excerpt = content[0...more_index] if options[:more_excerpt] && excerpt.empty?
    if options[:more_anchor]
      # Record the anchor id so it reaches the front matter; it was
      # previously stored in an unused local and never emitted.
      more_anchor = "more"
      # Non-destructive sub: the row's own string is left untouched.
      content = content.sub(more_tag,
                            "<a id=\"more\"></a>" +
                            "<a id=\"more-#{post[:id]}\"></a>")
    end
  end

  categories = []
  tags = []

  if options[:categories] || options[:tags]
    terms_query =
      "SELECT
         terms.name AS `name`,
         ttax.taxonomy AS `type`
       FROM
         #{px}#{sx}terms AS `terms`,
         #{px}#{sx}term_relationships AS `trels`,
         #{px}#{sx}term_taxonomy AS `ttax`
       WHERE
         trels.object_id = '#{post[:id]}' AND
         trels.term_taxonomy_id = ttax.term_taxonomy_id AND
         terms.term_id = ttax.term_id"

    db[terms_query].each do |term|
      if options[:categories] && term[:type] == "category"
        categories << scrub.call(term[:name])
      elsif options[:tags] && term[:type] == "post_tag"
        tags << scrub.call(term[:name])
      end
    end
  end

  comments = []

  if options[:comments] && post[:comment_count].to_i > 0
    comments_query =
      "SELECT
         comment_ID           AS `id`,
         comment_author       AS `author`,
         comment_author_email AS `author_email`,
         comment_author_url   AS `author_url`,
         comment_date         AS `date`,
         comment_date_gmt     AS `date_gmt`,
         comment_content      AS `content`
       FROM #{px}#{sx}comments
       WHERE
         comment_post_ID = '#{post[:id]}' AND
         comment_approved != 'spam'"

    db[comments_query].each do |comment|
      # Tag a copy as UTF-8 rather than re-tagging the row's string.
      comcontent = scrub.call(comment[:content].to_s.dup.force_encoding("UTF-8"))
      comauthor = scrub.call(comment[:author].to_s)

      comments << {
        'id'           => comment[:id].to_i,
        'author'       => comauthor,
        'author_email' => comment[:author_email].to_s,
        'author_url'   => comment[:author_url].to_s,
        'date'         => comment[:date].to_s,
        'date_gmt'     => comment[:date_gmt].to_s,
        'content'      => comcontent,
      }
    end

    comments = comments.sort_by { |comment| comment['id'] }
  end

  # Get the relevant fields as a hash, delete empty fields and
  # convert to YAML for the header.
  data = {
    'layout'        => post[:type].to_s,
    'status'        => post[:status].to_s,
    # Drafts get no 'published' key at all; other statuses get true/false.
    'published'     => post[:status].to_s == 'draft' ? nil : (post[:status].to_s == 'publish'),
    'title'         => title.to_s,
    'author'        => {
      'display_name'=> post[:author].to_s,
      'login'       => post[:author_login].to_s,
      'email'       => post[:author_email].to_s,
      'url'         => post[:author_url].to_s,
    },
    'author_login'  => post[:author_login].to_s,
    'author_email'  => post[:author_email].to_s,
    'author_url'    => post[:author_url].to_s,
    'excerpt'       => excerpt,
    'more_anchor'   => more_anchor,
    'wordpress_id'  => post[:id],
    'wordpress_url' => post[:guid].to_s,
    'date'          => date.to_s,
    'date_gmt'      => post[:date_gmt].to_s,
    'categories'    => options[:categories] ? categories : nil,
    'tags'          => options[:tags] ? tags : nil,
    'comments'      => options[:comments] ? comments : nil,
  }.delete_if { |_key, value| value.nil? || value == '' }.to_yaml

  if post[:type] == 'page'
    filename = page_path(post[:id], page_name_list) + "index.#{extension}"
    FileUtils.mkdir_p(File.dirname(filename))
  elsif post[:status] == 'draft'
    filename = "_drafts/#{slug}.md"
  else
    filename = "_posts/#{name}"
  end

  # Write out the data and content to file
  File.open(filename, "w") do |f|
    f.puts data
    f.puts "---"
    f.puts Util.wpautop(content)
  end
end
335
+
336
+
337
# Encodes non-ASCII characters in +text+ as named HTML entities while
# leaving HTML markup itself usable.
#
# text - String to encode (tagged as UTF-8 first when it supports it).
#
# Returns a new String; the argument is not modified.
def self.clean_entities(text)
  # Tag a copy as UTF-8 so the caller's (possibly frozen) string is
  # left alone.
  text = text.dup.force_encoding("UTF-8") if text.respond_to?(:force_encoding)
  text = HTMLEntities.new.encode(text, :named)
  # We don't want to convert these, it would break all
  # HTML tags in the post and comments. The substitutions run in this
  # order, one after another, exactly as before; only the last pair is
  # corrected so that an encoded slash is turned back into "/" rather
  # than every "/" being turned into an entity.
  [
    ["&amp;",  "&"],
    ["&lt;",   "<"],
    ["&gt;",   ">"],
    ["&quot;", '"'],
    ["&apos;", "'"],
    ["&#47;",  "/"]
  ].each do |entity, character|
    text = text.gsub(entity, character)
  end
  text
end
352
+
353
+
354
# Builds a URL-friendly slug from a title: transliterated to ASCII with
# String#to_ascii, lower-cased, every run of non-alphanumeric characters
# collapsed into a single "-", with no leading or trailing "-".
#
# title - String title; nil is treated as an empty title.
#
# Returns the slug String ("" when nothing usable is left).
def self.sluggify(title)
  title.to_s.to_ascii.downcase.gsub(/[^0-9A-Za-z]+/, " ").strip.tr(" ", "-")
end
357
+
358
# Builds the directory path of a page by walking up its parent chain.
#
# page_id        - id of the page to locate
# page_name_list - Hash of page id => { :slug, :parent }
#
# Returns the ancestors' slugs followed by the page's own slug, each
# terminated by "/", or "" when the id is not in the list.
def self.page_path(page_id, page_name_list)
  return "" unless page_name_list.key?(page_id)

  entry = page_name_list[page_id]
  "#{page_path(entry[:parent], page_name_list)}#{entry[:slug]}/"
end
369
+
370
+ end
371
+ end
372
+ end