sutty-migration 0.2.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 046cf945de1c0736e329224151a4b331c87e44cab6e6c2bc1e69f22b7639fe68
4
- data.tar.gz: 58effbee202ab51c7ff1ed4e8e98498cd5a65f618c6ae6fc4f924fe6aed2e0e1
3
+ metadata.gz: e94ad861c92188564cbac8821283cd8d6229ae2c85f44ebc24dbc235228de794
4
+ data.tar.gz: e2ae77d06641891aeb6572693536fcfb2e436e2691abf8994cdc540e92104780
5
5
  SHA512:
6
- metadata.gz: a21cb549bddd9bc55218c0633932300811e547d2d4cfde2525fbcdcaf8a2ff3bb89f0542813b46fdb9b90c9d739166ce37e303fb4de76b918b52bee9402fcb6d
7
- data.tar.gz: 544f18359b4e9996c07f6828643bb8d4856d55457b4a6134b2ef2cedf01d4471d294effc6024598ef66f9b1d2258345f44ac370e1678166a2b9d19b5c4a55e74
6
+ metadata.gz: 9b3382a28169ae769d3993ef5fa7c01421fb8eb951b27e383ec084e22951242a110e11704f8f38ec198740d1673b096044d7f2bf00e63daa72294feffab04cc9
7
+ data.tar.gz: 661841af686da59fa2ebc08be9c0d992e5fb3965e25a6c4751e0ff192ddd65dde2cc426c0fcd610ae1a19fd494b0e1c5d245459848ab6c768052480a6a96af0f
data/README.md CHANGED
@@ -23,6 +23,7 @@ Add the plugin to your `_config.yml`:
23
23
  ```yaml
24
24
  plugins:
25
25
  - sutty-migration
26
+ array_separator: ','
26
27
  ```
27
28
 
28
29
  Compile a CSV file with the following required fields:
@@ -114,6 +115,55 @@ Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
114
115
  end
115
116
  ```
116
117
 
118
+ ### WordPress XML
119
+
120
+ If you have the XML dump from a WordPress site, you can migrate content
121
+ by writing a migration plugin.
122
+
123
+ ```ruby
124
+ # frozen_string_literal: true
125
+
126
+ require 'sutty_migration/jekyll/document_creator'
127
+ require 'sutty_migration/wordpress_xml'
128
+ require 'jekyll-write-and-commit-changes'
129
+ require 'securerandom'
130
+
131
+ # Run after reading the site
132
Jekyll::Hooks.register :site, :post_read do |site|
  # Put the XML dump at _files/wordpress.xml
  xml = SuttyMigration::WordpressXml.new site: site, file: '_files/wordpress.xml'

  # Download all files
  xml.attachments.values.each(&:download)

  # Migrate posts.  You can move metadata around and recover
  # relationships or any info your theme requires.
  xml.posts.values.each do |post|
    # Update documents already migrated.  Replace 'posts' with the
    # label of the collection you are migrating into.
    doc = Jekyll::Document.find_or_create(site: site, collection: 'posts', title: post.title, slug: post.slug, date: post.date)
    # Nothing is returned when the file exists but Jekyll didn't read it.
    next unless doc

    # Don't change the UUIDv4
    doc.data['uuid'] ||= SecureRandom.uuid
    doc.data['draft'] = post.draft?
    doc.data['layout'] = 'post'
    doc.data['last_modified_at'] = post.last_modified_at

    doc.data['categories'] = post.categories.map { |c| c[:title] }
    doc.data['tags'] = post.tags.map { |t| t[:title] }

    # The author may be missing from the dump
    doc.data['author'] = post.author&.dig(:email)
    doc.data['description'] = post.description
    doc.content = post.content

    doc.save
  rescue StandardError => e
    # Keep migrating the remaining posts, but say why this one failed.
    Jekyll.logger.warn "Couldn't migrate #{post.title}: #{e.message}"
  end

  exit # Stop here
end
164
+ ```
165
+
166
+
117
167
  ## Contributing
118
168
 
119
169
  Bug reports and pull requests are welcome on 0xacab.org at
@@ -13,6 +13,8 @@ require_relative 'jekyll/document_creator'
13
13
  Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
14
14
  documents = site.documents
15
15
 
16
+ array_separator = site.config.fetch('array_separator', ',')
17
+
16
18
  site.data['layouts']&.each do |name, layout|
17
19
  site.data.dig('migration', name)&.each do |row|
18
20
  row['date'] = Jekyll::Utils.parse_date(row['date']) unless row['date'].blank?
@@ -24,41 +26,47 @@ Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
24
26
  end
25
27
  end
26
28
 
27
- document ||= Jekyll::Document.create(site: site, collection: 'posts', **row.slice(*%w[date slug title]).transform_keys(&:to_sym))
29
+ document ||= begin
30
+ data = row.slice(*%w[date slug title]).transform_keys(&:to_sym)
31
+ Jekyll::Document.find_or_create(site: site, collection: 'posts', **data)
32
+ end
33
+ next unless document
28
34
 
29
35
  row.each do |attribute, value|
30
- next unless value.blank?
36
+ next if value.nil? || value.blank?
37
+
38
+ value.strip! if value.is_a? String
31
39
 
32
40
  row[attribute] =
33
41
  case layout.dig(attribute, 'type')
34
- when 'string' then value
35
- when 'text' then value
36
- when 'tel' then value
37
- # TODO: validate
38
- when 'color' then value
39
- when 'date' then Jekyll::Utils.parse_date(value)
40
- # TODO: validate
41
- when 'email' then value
42
- # TODO: validate
43
- when 'url' then value
44
- when 'content' then value
45
- when 'markdown_content' then value
46
- when 'markdown' then value
47
- when 'number' then value.to_i
48
- when 'order' then value.to_i
49
- when 'boolean' then !value.strip.empty?
50
- when 'array' then value.split(',').map(&:strip)
51
- # TODO: process values from the default array
52
- when 'predefined_array' then value.split(',').map(&:strip)
53
- when 'image' then { 'path' => value, 'description' => '' }
54
- when 'file' then { 'path' => value, 'description' => '' }
55
- when 'geo' then %w[lat lng].zip(value.split(',', 2).map(&:to_f)).to_h
56
- when 'belongs_to' then value
57
- when 'has_many' then value.split(',').map(&:strip)
58
- when 'has_and_belongs_to_many' then value.split(',').map(&:strip)
59
- when 'related_posts' then value.split(',').map(&:strip)
60
- when 'locales' then value.split(',').map(&:strip)
61
- else value
42
+ when 'string' then value.tr("\n", ' ').squeeze(' ')
43
+ when 'text' then value.gsub("\n", "\n\n")
44
+ when 'tel' then value.tr("\n", ' ').squeeze(' ')
45
+ # TODO: validate
46
+ when 'color' then value.tr("\n", ' ').squeeze(' ')
47
+ when 'date' then Jekyll::Utils.parse_date(value)
48
+ # TODO: validate
49
+ when 'email' then value.tr("\n", ' ').squeeze(' ')
50
+ # TODO: validate
51
+ when 'url' then value.tr("\n", ' ').squeeze(' ')
52
+ when 'content' then value.gsub("\n", "\n\n")
53
+ when 'markdown_content' then value.gsub("\n", "\n\n")
54
+ when 'markdown' then value.gsub("\n", "\n\n")
55
+ when 'number' then value.to_i
56
+ when 'order' then value.to_i
57
+ when 'boolean' then !value.strip.empty?
58
+ when 'array' then value.split(array_separator).map(&:strip)
59
+ # TODO: process values from the default array
60
+ when 'predefined_array' then value.split(array_separator).map(&:strip)
61
+ when 'image' then { 'path' => value, 'description' => '' }
62
+ when 'file' then { 'path' => value, 'description' => '' }
63
+ when 'geo' then %w[lat lng].zip(value.split(array_separator, 2).map(&:to_f)).to_h
64
+ when 'belongs_to' then value
65
+ when 'has_many' then value.split(array_separator).map(&:strip)
66
+ when 'has_and_belongs_to_many' then value.split(array_separator).map(&:strip)
67
+ when 'related_posts' then value.split(array_separator).map(&:strip)
68
+ when 'locales' then value.split(array_separator).map(&:strip)
69
+ else value
62
70
  end
63
71
  end
64
72
 
@@ -7,31 +7,75 @@ module SuttyMigration
7
7
  module Jekyll
8
8
  module DocumentCreator
9
9
  class DocumentExists < ArgumentError; end
10
+
10
11
  def self.included(base)
11
12
  base.class_eval do
13
+ class << self
14
+ # Creates a new document in a collection or fails if it already
15
+ # exists.
16
+ #
17
+ # @param :site [Jekyll::Site] Jekyll site
18
+ # @param :date [Time] Post date
19
+ # @param :title [String] Post title
20
+ # @param :slug [String] Post slug, slugified title if empty
21
+ # @param :collection [Jekyll::Collection,String] Collection label or collection
22
+ # @return [Jekyll::Document] A new document
23
+ def create(site:, date:, title:, collection:, slug: nil)
24
+ collection = site.collections[collection] if collection.is_a? String
25
+ slug = ::Jekyll::Utils.slugify(title, mode: 'latin') if slug.blank?
26
+ basename = "#{date.strftime('%F')}-#{slug}.markdown"
27
+ path = File.join(collection.directory, basename)
12
28
 
13
- # Creates a new document in a collection or fails if it already
14
- # exists.
15
- #
16
- # @param :site [Jekyll::Site] Jekyll site
17
- # @param :date [Time] Post date
18
- # @param :title [String] Post title
19
- # @param :slug [String] Post slug, slugified title if empty
20
- # @param :collection [Jekyll::Collection,String] Collection label or collection
21
- # @return [Jekyll::Document] A new document
22
- def self.create(site:, date:, title:, slug: nil, collection:)
23
- collection = site.collections[collection] if collection.is_a? String
24
- slug = ::Jekyll::Utils.slugify(title, mode: 'latin') if slug.blank?
25
- basename = "#{date.strftime('%F')}-#{slug}.markdown"
26
- path = File.join(collection.directory, basename)
27
-
28
- raise DocumentExists, "#{path} already exists" if File.exist? path
29
-
30
- ::Jekyll::Document.new(path, site: site, collection: collection).tap do |document|
31
- collection.docs << document
32
- document.data['title'] = title
33
- end
34
- end
29
+ raise DocumentExists, "#{path} already exists" if File.exist? path
30
+
31
+ ::Jekyll::Document.new(path, site: site, collection: collection).tap do |document|
32
+ collection.docs << document
33
+ document.data['title'] = title
34
+ end
35
+ end
36
+
37
+ # Finds a document by its relative path or creates it if it
38
+ # doesn't exist. Helpful for idempotent migrations (create or
39
+ # update actions)
40
+ #
41
+ # @param :site [Jekyll::Site] Jekyll site
42
+ # @param :date [Time] Post date
43
+ # @param :title [String] Post title
44
+ # @param :slug [String] Post slug, slugified title if empty
45
+ # @param :collection [Jekyll::Collection,String] Collection label or collection
46
+ # @return [Jekyll::Document] The found document or a new one
47
+ def find_or_create(site:, date:, title:, collection:, slug: nil)
48
+ collection = site.collections[collection] if collection.is_a? String
49
+ slug = ::Jekyll::Utils.slugify(title, mode: 'latin') if slug.blank?
50
+ basename = "#{date.strftime('%F')}-#{slug}.markdown"
51
+ path = File.join(collection.relative_directory, basename)
52
+
53
+ return find(site: site, relative_path: path) if File.exist?(path)
54
+
55
+ create(site: site, date: date, title: title, slug: slug, collection: collection)
56
+ end
57
+
58
+ # Finds a document by its relative path
59
+ #
60
+ # @param :site [Jekyll::Site]
61
+ # @param :relative_path [String]
62
+ # @return [Jekyll::Document,Nil]
63
+ def find(site:, relative_path:)
64
+ indexed_documents_by_relative_path(site)[relative_path]
65
+ end
66
+
67
+ # Index documents by relative path for faster finding
68
+ #
69
+ # @param [Jekyll::Site]
70
+ # @return [Hash]
71
+ def indexed_documents_by_relative_path(site)
72
+ @indexed_documents_by_relative_path ||= site.documents.reduce({}) do |idx, doc|
73
+ idx.tap do |i|
74
+ i[doc.relative_path] = doc
75
+ end
76
+ end
77
+ end
78
+ end
35
79
  end
36
80
  end
37
81
  end
@@ -38,15 +38,21 @@ module SuttyMigration
38
38
  #
39
39
  # @return [Hash] { blog_id => blog row (Hash) with the SuttyMigration::Wordpress for that blog under :db }
40
40
  def blogs
41
- @blogs ||= wp["select blog_id as id, domain, path from #{prefix}blogs"].to_a.map do |blog|
41
+ @blogs ||= wp["select * from #{prefix}blogs"].to_a.map do |blog|
42
42
  url = "https://#{blog[:domain]}#{blog[:path]}"
43
- pfx = "#{prefix}#{blog[:id]}_" if blog[:id] > 1
43
+ pfx = "#{prefix}#{blog[:blog_id]}_" if blog[:blog_id] > 1
44
44
  pfx ||= prefix
45
45
 
46
- [ blog[:id], self.class.new(site: site, url: url, prefix: pfx, database: database, limit: limit, multisite: self) ]
46
+ [blog[:blog_id],
47
+ blog.merge(db: self.class.new(site: site, url: url, prefix: pfx, database: database, limit: limit,
48
+ multisite: self))]
47
49
  end.to_h
48
50
  end
49
51
 
52
# Site options, read once from the options table.
#
# @return [Hash] { option_name (Symbol) => option_value }
def options
  @options ||= begin
    rows = wp["select option_name, option_value from #{prefix}options"].to_a

    rows.each_with_object({}) do |row, opts|
      name, value = row.values
      opts[name.to_sym] = value
    end
  end
end
55
+
50
56
  # Open the database.
51
57
  #
52
58
  # @return [Sequel::SQLite::Database]
@@ -90,11 +96,11 @@ module SuttyMigration
90
96
  end
91
97
 
92
98
  Faraday.get(url) do |req|
93
- req.options.on_data = Proc.new do |chunk, downloaded_bytes|
99
+ req.options.on_data = proc do |chunk, downloaded_bytes|
94
100
  f.write chunk
95
101
 
96
102
  if progress
97
- progress.progress = (downloaded_bytes > content_length) ? content_length : downloaded_bytes
103
+ progress.progress = downloaded_bytes > content_length ? content_length : downloaded_bytes
98
104
  end
99
105
  end
100
106
  end
@@ -126,10 +132,31 @@ module SuttyMigration
126
132
  p.map do |post|
127
133
  # Sequel parses dates on localtime
128
134
  post[:date] = ::Jekyll::Utils.parse_date(post[:date]) unless post[:date].blank?
129
- post[:last_modified_at] = ::Jekyll::Utils.parse_date(post[:last_modified_at]) unless post[:last_modified_at].blank?
135
+ unless post[:last_modified_at].blank?
136
+ post[:last_modified_at] =
137
+ ::Jekyll::Utils.parse_date(post[:last_modified_at])
138
+ end
130
139
 
131
- post[:front_matter] = JSON.parse(post[:front_matter]).transform_keys(&:to_sym) unless post[:front_matter].blank?
132
- post[:terms] = JSON.parse(post[:terms]).transform_keys(&:to_sym) unless post[:terms].blank?
140
+ post[:front_matter] =
141
+ begin
142
+ unless post[:front_matter].blank?
143
+ JSON.parse(post[:front_matter]).transform_keys(&:to_sym).transform_values do |v|
144
+ v.size == 1 ? v.first : v
145
+ end
146
+ end
147
+ rescue JSON::ParserError
148
+ {}
149
+ end
150
+ post[:terms] =
151
+ begin
152
+ unless post[:terms].blank?
153
+ JSON.parse(post[:terms]).transform_keys(&:to_sym).transform_values do |v|
154
+ v.size == 1 ? v.first : v
155
+ end
156
+ end
157
+ rescue JSON::ParserError
158
+ {}
159
+ end
133
160
  end
134
161
  end
135
162
  end
@@ -163,7 +190,7 @@ module SuttyMigration
163
190
  <<~EOQ
164
191
  select
165
192
  u.*
166
- #{", json_group_object(m.meta_key, m.meta_value) as meta" if with_meta}
193
+ #{', json_group_object(m.meta_key, m.meta_value) as meta' if with_meta}
167
194
  from #{pfx}users as u
168
195
  #{"left join #{pfx}usermeta as m on m.user_id = u.id" if with_meta}
169
196
  group by u.id
@@ -199,31 +226,56 @@ module SuttyMigration
199
226
  p.menu_order as menu_order,
200
227
  p.post_mime_type as mime_type,
201
228
  p.comment_count as comment_count
202
- #{", json_group_object(f.meta_key, f.meta_value) as front_matter" if with_meta}
203
- #{", t.terms as terms" if with_meta}
229
+ #{', f.front_matter as front_matter' if with_meta}
230
+ #{', t.terms as terms' if with_meta}
204
231
  from #{prefix}posts as p
205
- left join #{prefix}postmeta as f on p.ID = f.post_id
206
- #{"left join (#{terms_query(layout: layout)}) as t on t.id = p.ID" if with_meta}
232
+ #{"left join (#{meta_query}) as f on f.post_id = p.ID" if with_meta}
233
+ #{"left join (#{terms_query}) as t on t.post_id = p.ID" if with_meta}
207
234
  #{"where p.post_type = '#{layout}'" if layout}
208
235
  group by p.ID
209
236
  EOQ
210
237
  end
211
238
 
239
+ # Recover the post meta as a JSON object with multiple values
240
+ # converted to arrays
241
+ #
242
+ # @return [String]
243
# Builds the SQL that collects the post meta as one JSON object per
# post.  The inner query gathers the values of each key into a JSON
# array; the outer query folds those arrays into an object per post.
#
# @return [String]
def meta_query
  postmeta = "#{prefix}postmeta"

  <<~EOQ
    select
      post_id,
      json_group_object(meta_key, json(meta_values)) as front_matter
    from (
      select
        post_id,
        meta_key,
        json_group_array(meta_value) as meta_values
      from #{postmeta}
      group by post_id, meta_key
    )
    group by post_id
  EOQ
end
259
+
212
260
  # Term taxonomy query
213
261
  #
214
262
  # Takes no parameters: terms are collected for every post, whatever its layout.
215
263
  # @return [String]
216
- def terms_query(layout: nil)
264
# Builds the SQL that collects the terms of every post as one JSON
# object per post, shaped as { taxonomy => [term names] }.
#
# The inner query must group by post *and* taxonomy.  Grouping by
# post only would put the names of every term of the post in a
# single array, filed under whichever taxonomy SQLite happens to
# pick for the ungrouped column.
#
# @return [String]
def terms_query
  <<~EOQ
    select
      post_id,
      json_group_object(taxonomy, json(terms)) as terms
    from (
      select
        r.object_id as post_id,
        tt.taxonomy,
        json_group_array(t.name) as terms
      from #{prefix}term_relationships as r
      left join #{prefix}term_taxonomy as tt on tt.term_taxonomy_id = r.term_taxonomy_id
      left join #{prefix}terms as t on t.term_id = tt.term_id
      group by r.object_id, tt.taxonomy)
    group by post_id
  EOQ
end
229
281
  end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'post'
4
+ require 'php-serialize'
5
+ require 'faraday'
6
+ require 'progressbar'
7
+
8
module SuttyMigration
  class WordpressXml
    # Represents an attachment or uploaded file.
    class Attachment < Post
      # File URL
      #
      # @return [String]
      def attachment_url
        @attachment_url ||= attribute_value 'attachment_url'
      end

      # File destination: the path of the URL without the leading
      # slash, so it's relative to the current directory.
      #
      # @return [String]
      def dest
        @dest ||= URI(attachment_url).path.sub(%r{\A/}, '')
      end

      # Metadata, with file information as a Hash
      #
      # The Hash returned by Post#meta is memoized, so the serialized
      # value is only converted while it's still a String.  This way
      # calling the method again, or an attachment without file
      # information, doesn't fail.
      #
      # @return [Hash]
      def meta
        super.tap do |m|
          raw = m['_wp_attachment_metadata']

          m['_wp_attachment_metadata'] = PHP.unserialize(raw) if raw.is_a?(String) && !raw.empty?
        end
      end

      # Download the file if it doesn't exist.  Optionally show a
      # progress bar.
      #
      # An incomplete file would be taken for an already downloaded one
      # the next time, so it's removed when the download fails.
      #
      # @param :progress [Boolean]
      # @return [Boolean] Whether the file is available locally
      def download(progress: true)
        return true if File.exist? dest

        ::Jekyll.logger.info "Downloading #{dest}"

        FileUtils.mkdir_p File.dirname(dest)

        begin
          # Binary mode, so images and other files are written untouched
          response = File.open(dest, 'wb') do |f|
            content_length = 0
            bar = nil

            if progress
              head = Faraday.head(attachment_url)
              content_length = head.headers['content-length'].to_i
              bar = ProgressBar.create(title: File.basename(dest), total: content_length, output: $stderr)
            end

            Faraday.get(attachment_url) do |req|
              req.options.on_data = proc do |chunk, downloaded_bytes|
                f.write chunk

                # Never go past the announced size
                bar.progress = [downloaded_bytes, content_length].min if bar
              end
            end
          end
        rescue StandardError
          FileUtils.rm_f dest
          raise
        end

        # Don't keep error pages as if they were the file
        unless response.success?
          FileUtils.rm_f dest
          return false
        end

        File.exist? dest
      end
    end
  end
end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'wordpress_formatting/wpautop'
4
+ require 'jekyll/utils'
5
+
6
module SuttyMigration
  class WordpressXml
    # Represents a WordPress post.  It reads values on demand from an
    # <item> element of the XML dump and memoizes them.
    class Post
      attr_reader :wordpress, :item

      # @param :wordpress [SuttyMigration::WordpressXml]
      # @param :item [Nokogiri::XML::Element]
      def initialize(wordpress:, item:)
        @wordpress = wordpress
        @item = item
      end

      def inspect
        "#<SuttyMigration::WordpressXml::Post title=\"#{title}\">"
      end

      # Post ID
      #
      # @return [Integer]
      def id
        @id ||= attribute_value('post_id').to_i
      end

      # Permalink.  The link to the post with the site URL removed.
      #
      # @return [String]
      def permalink
        @permalink ||= attribute_value('link').sub(wordpress.url, '')
      end

      # Title
      #
      # @return [String]
      def title
        @title ||= attribute_value('title')
      end

      # Description
      #
      # @return [String]
      def description
        @description ||= attribute_value('description')
      end

      # Slug ("post name")
      #
      # @return [String]
      def slug
        @slug ||= attribute_value('post_name')
      end

      # Publication date.
      #
      # WordPress can store this date in three different fields and
      # sometimes they come empty or invalid, so the first one that
      # can be parsed is used.
      #
      # @return [Time,nil]
      def date
        @date ||= %w[pubDate post_date_gmt post_date].map do |date_attr|
          ::Jekyll::Utils.parse_date attribute_value(date_attr)
        rescue StandardError
          # Invalid date, try the next field
          nil
        end.compact.first
      end

      # Modification date.
      #
      # @return [Time]
      def last_modified_at
        @last_modified_at ||= ::Jekyll::Utils.parse_date attribute_value('post_modified_gmt')
      end

      # Content as HTML, with site URL removed.
      #
      # @return [String]
      def content
        # The URL is escaped so its dots only match dots
        @content ||= WordpressFormatting::Wpautop.wpautop(attribute_value('encoded')).gsub(
          / (href|src)="#{Regexp.escape(wordpress.url)}/, ' \\1="'
        )
      end

      # Author attributes.
      #
      # Authors are indexed by e-mail address, but WordPress exports
      # the author login as the creator of each item.  When the creator
      # isn't an e-mail address, the address is resolved from the
      # author list of the dump.
      #
      # @return [Hash,nil]
      def author
        @author ||= begin
          creator = attribute_value('creator')
          authors = wordpress.authors || {}

          authors[creator] || authors[author_email_for(creator)]
        end
      end

      # Post password. Use with jekyll-crypto.
      #
      # @return [String]
      def password
        @password ||= attribute_value 'post_password'
      end

      # Tags with attributes.  Tags unknown to the site are skipped.
      #
      # @return [Array<Hash>]
      def tags
        @tags ||= item.css('category').select do |c|
          c[:domain] == 'post_tag'
        end.map do |c|
          wordpress.tags[c[:nicename]]
        end.compact
      end

      # Categories with attributes.  Categories unknown to the site are
      # skipped.
      #
      # @return [Array<Hash>]
      def categories
        @categories ||= item.css('category').select do |c|
          c[:domain] == 'category'
        end.map do |c|
          wordpress.categories[c[:nicename]]
        end.compact
      end

      # Metadata. Plugins store useful information here. Duplicated
      # keys are returned as an Array of values.
      #
      # @return [Hash]
      def meta
        @meta ||= {}.tap do |meta|
          item.css('postmeta').each do |m|
            key = m.css('meta_key').text
            value = m.css('meta_value').text

            case meta[key]
            when nil then meta[key] = value
            when String then meta[key] = [meta[key], value]
            when Array then meta[key] << value
            end
          end
        end
      end

      # Order. Higher are sorted on top by jekyll-order.
      #
      # @return [Integer]
      def order
        @order ||= attribute_value('is_sticky').to_i
      end

      # Publication status
      #
      # @return [Boolean]
      def published?
        attribute_value('status') == 'publish'
      end

      # Publication status
      #
      # @return [Boolean]
      def draft?
        attribute_value('status') == 'draft'
      end

      # Get a value from the attribute
      #
      # @param key [String] Element name
      # @return [String] The text of the element, empty when the item
      #   doesn't have it
      def attribute_value(key)
        item.at_css(key)&.text.to_s
      end

      private

      # Finds the e-mail address of an author by their login.
      #
      # @param login [String]
      # @return [String,nil]
      def author_email_for(login)
        node = wordpress.xml.css('channel > author').find do |a|
          a.at_css('author_login')&.text == login
        end

        node&.at_css('author_email')&.text
      end
    end
  end
end
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require_relative 'wordpress_xml/post'
5
+ require_relative 'wordpress_xml/attachment'
6
+
7
module SuttyMigration
  # Understands the XML dump generated by Wordpress and creates
  # Jekyll::Documents
  #
  # Every collection (authors, categories, tags, posts, pages and
  # attachments) is a Hash, empty when the dump has none of them.
  class WordpressXml
    attr_reader :site, :file, :xml

    # @param :site [Jekyll::Site] Jekyll site
    # @param :file [String] File path
    def initialize(site:, file:)
      @site = site
      @file = file
      @xml = Nokogiri::XML File.read(file)

      # Make things easier by removing namespaces.
      xml.remove_namespaces!
    end

    def inspect
      '#<SuttyMigration::WordpressXml>'
    end

    # Site URL
    #
    # @return [String]
    def url
      @url ||= attribute_value(xml, 'channel > link')
    end

    # Site title
    #
    # @return [String]
    def title
      @title ||= attribute_value(xml, 'channel > title')
    end

    # Description
    #
    # @return [String]
    def description
      @description ||= attribute_value(xml, 'channel > description')
    end

    # Language
    #
    # TODO: Migrate multilanguage sites.
    #
    # @return [String]
    def language
      @language ||= attribute_value(xml, 'channel > language')
    end

    # Authors with attributes, indexed by author email.
    #
    # @return [Hash]
    def authors
      @authors ||= xml.css('channel > author').map do |author|
        email = attribute_value(author, 'author_email')

        [email, {
          id: attribute_value(author, 'author_id').to_i,
          display_name: attribute_value(author, 'author_display_name'),
          first_name: attribute_value(author, 'author_first_name'),
          last_name: attribute_value(author, 'author_last_name'),
          email: email
        }]
      end.to_h
    end

    # Categories with attributes, indexed by slug ("nicename")
    #
    # @return [Hash]
    def categories
      @categories ||= xml.css('channel > category').map do |category|
        slug = attribute_value(category, 'category_nicename')

        [slug, {
          id: attribute_value(category, 'term_id').to_i,
          title: attribute_value(category, 'cat_name'),
          parent: attribute_value(category, 'category_parent'),
          slug: slug
        }]
      end.to_h
    end

    # Tags with attributes, indexed by slug
    #
    # @return [Hash]
    def tags
      @tags ||= xml.css('channel > tag').map do |tag|
        slug = attribute_value(tag, 'tag_slug')

        [slug, {
          id: attribute_value(tag, 'term_id').to_i,
          title: attribute_value(tag, 'tag_name'),
          slug: slug
        }]
      end.to_h
    end

    # Posts, indexed by ID
    #
    # @return [Hash]
    def posts
      @posts ||= items_find_by('post_type', 'post').map do |post|
        [attribute_value(post, 'post_id').to_i, Post.new(wordpress: self, item: post)]
      end.to_h
    end

    # Pages, indexed by ID
    #
    # @return [Hash]
    def pages
      @pages ||= items_find_by('post_type', 'page').map do |page|
        [attribute_value(page, 'post_id').to_i, Post.new(wordpress: self, item: page)]
      end.to_h
    end

    # Attachments, indexed by ID
    #
    # @return [Hash]
    def attachments
      @attachments ||= items_find_by('post_type', 'attachment').map do |attachment|
        [attribute_value(attachment, 'post_id').to_i, Attachment.new(wordpress: self, item: attachment)]
      end.to_h
    end

    # Find items by attribute and value
    #
    # @param [String] Attribute name
    # @param [String] Attribute value
    # @return [Array<Nokogiri::XML::Element>]
    def items_find_by(attribute, value)
      xml.css('channel > item').select do |item|
        attribute_value(item, attribute) == value
      end
    end

    # Get element's attribute value
    #
    # @param [Nokogiri::XML::Element]
    # @param [String]
    # @return [String] The text of the first matching element, empty
    #   when there's none
    def attribute_value(element, attribute)
      element.at_css(attribute)&.text.to_s
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sutty-migration
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - f
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-09 00:00:00.000000000 Z
11
+ date: 2021-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: jekyll
@@ -108,6 +108,48 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '5.45'
111
+ - !ruby/object:Gem::Dependency
112
+ name: wordpress-formatting
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: 0.1.0
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: 0.1.0
125
+ - !ruby/object:Gem::Dependency
126
+ name: nokogiri
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: 1.12.0
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: 1.12.0
139
+ - !ruby/object:Gem::Dependency
140
+ name: php-serialize
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 1.3.0
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 1.3.0
111
153
  - !ruby/object:Gem::Dependency
112
154
  name: pry
113
155
  requirement: !ruby/object:Gem::Requirement
@@ -138,7 +180,9 @@ files:
138
180
  - lib/sutty_migration/data.rb
139
181
  - lib/sutty_migration/jekyll/document_creator.rb
140
182
  - lib/sutty_migration/wordpress.rb
141
- - lib/wordpress.rb
183
+ - lib/sutty_migration/wordpress_xml.rb
184
+ - lib/sutty_migration/wordpress_xml/attachment.rb
185
+ - lib/sutty_migration/wordpress_xml/post.rb
142
186
  homepage: https://0xacab.org/sutty/jekyll/sutty-migration
143
187
  licenses:
144
188
  - GPL-3.0
data/lib/wordpress.rb DELETED
@@ -1,192 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # Generar UUIDs
4
- require 'securerandom'
5
- # Traer resultados de la base de datos
6
- require 'sequel'
7
- require 'sqlite3'
8
- require 'json'
9
- # Descargar archivos
10
- require 'faraday'
11
- require 'progressbar'
12
-
13
- class Wordpress
14
- attr_reader :site, :prefix, :limit, :url
15
-
16
- def initialize(site:, url:, prefix: 'wp_', limit: 10)
17
- @site = site
18
- @prefix = prefix.freeze
19
- @limit = limit.freeze
20
- @url = url.freeze
21
-
22
- # Conectarse a la base de datos
23
- @wp = Sequel.sqlite(File.join(site.source, '_data', 'wordpress', 'post.sqlite3'))
24
- # Las funciones de JSON usan mucha CPU, vamos a traer de a pocos
25
- # registros.
26
- @wp.extension :pagination
27
- end
28
-
29
- def download(file)
30
- dest = "wp-content/uploads/#{file}"
31
- full = File.join(site.source, dest)
32
-
33
- return dest if File.exist? full
34
-
35
- Jekyll.logger.info "Downloading #{dest}"
36
-
37
- FileUtils.mkdir_p File.dirname(full)
38
-
39
- File.open(full, 'w') do |f|
40
- url = "#{url}/#{dest}"
41
- head = Faraday.head(url)
42
- content_length = head.headers['content-length']
43
- progress_bar = ProgressBar.new
44
-
45
- Faraday.get(url) do |req|
46
- req.options.on_data = Proc.new do |chunk, downloaded_bytes|
47
- f.write chunk
48
- end
49
- end
50
- end
51
-
52
- dest
53
- end
54
-
55
- # Obtiene todos los tipos de artículos disponibles
56
- #
57
- # @return [Array]
58
- def layouts
59
- @layouts ||= @wp["select distinct post_type from #{prefix}posts"].to_a.map(&:values).flatten
60
- end
61
-
62
- # Obtiene todos los posts opcionalmente filtrando por tipo de post.
63
- # No es la forma oficial de Sequel pero no tenemos tiempo de
64
- # aprenderla específicamente y además tenemos las opciones en formato
65
- # JSON que no estarían soportadas.
66
- #
67
- # @param :layout [String] Layout name, one of #layouts
68
- # @param :with_meta [Boolean]
69
- # @return [Enumerator]
70
- def posts(**options)
71
- if options[:layout] && !layouts.include?(options[:layout])
72
- raise ArgumentError, "#{layout} must be one of #{layouts.join(', ')}"
73
- end
74
-
75
- @posts ||= {}
76
- @posts[options[:layout] || 'all'] ||= @wp[post_query(**options)].each_page(limit).to_a.map(&:to_a).flatten.tap do |p|
77
- next unless options[:with_meta]
78
-
79
- p.map do |post|
80
- post[:front_matter] = JSON.parse(post[:front_matter]) unless post[:front_matter].nil?
81
- post[:terms] = JSON.parse(post[:terms]) unless post[:terms].nil?
82
- end
83
- end
84
- end
85
-
86
- private
87
-
88
- # Consulta para los posts, incluyendo metadatos en JSON. Los
89
- # metadatos vienen en dos partes porque tienen dos
90
- #
91
- # @return [String]
92
- def post_query(layout: nil, with_meta: true)
93
- @post_query ||= <<~EOQ
94
- select
95
- p.ID as id,
96
- p.post_title as title,
97
- p.post_name as slug,
98
- p.post_type as layout,
99
- p.strftime('%Y-%m-%d', post_date) as date,
100
- p.post_status as status,
101
- p.post_content as content
102
- #{", json_group_object(f.meta_key, f.meta_value) as front_matter" if with_meta}
103
- #{", t.meta as meta" if with_meta}
104
- from #{prefix}posts as p
105
- left join #{prefix}postmeta as f on p.ID = f.post_id
106
- #{"left join (#{meta_query(layout: layout)}) as as t on t.id = p.ID" if with_meta}
107
- #{"where p.post_type = :layout" if layout}
108
- group by p.ID
109
- EOQ
110
- end
111
-
112
- #
113
- def meta_query(layout: nil)
114
- @meta_query ||= <<~EOQ
115
- select
116
- p.ID as id,
117
- json_group_object(tt.taxonomy, t.name) as meta
118
- from #{prefix}posts as p
119
- left join #{prefix}term_relationships as r on r.object_id = p.ID
120
- left join #{prefix}term_taxonomy as tt on tt.term_taxonomy_id = r.term_taxonomy_id
121
- left join #{prefix}terms as t on t.term_id = tt.term_id
122
- #{"where p.post_type = :layout" if layout}
123
- group by p.ID
124
- EOQ
125
- end
126
- end
127
-
128
- # Antes de generar el sitio vamos a leer todos los artículos desde la
129
- # base de datos y generarlos localmente.
130
- Jekyll::Hooks.register :site, :post_read do |site|
131
- wp = Wordpress.new(site: site,
132
- url: site.config.dig('wordpress', 'url'),
133
- prefix: site.config.dig('wordpress', 'prefix'))
134
-
135
- collection = site.collections['posts']
136
- ascii_re = Regexp.new("\P{ASCII}").freeze
137
- sanitizer = Rails::Html::SafeListSanitizer.new
138
-
139
- # Traer todas las imágenes cargadas y descargarlas
140
- attachments = wp.posts(layout: 'attachment').map do |page|
141
- page.map do |attachment|
142
- attachment[:data] = JSON.parse(attachment[:data]) unless attachment[:data].nil?
143
- file = attachment.dig(:data, '_wp_attached_file')
144
-
145
- next unless file
146
-
147
- dest = wp.download(file)
148
-
149
- # Tener un mapa de IDs y archivos destino
150
- [ attachment[:id], dest ]
151
- end
152
- end.compact.flatten(1).to_h
153
-
154
- %w[post page].each do |type|
155
- wp.posts(layout: type).each do |page|
156
- page.each do |post|
157
- # Convertir los datos extra en un Hash
158
- post[:data] = JSON.parse(post[:data]) unless post[:data].nil?
159
- post[:slug] = Jekyll::Utils.slugify(post[:title], mode: 'latin') if post[:slug].empty?
160
- post[:meta] = wp.meta id: post[:id]
161
-
162
- path = File.join(site.source, '_posts', post.slice(:date, :slug).values.join('-') + '.markdown')
163
-
164
- if File.exist? path
165
- Jekyll.logger.info "#{path} ya fue migrado, actualizando"
166
-
167
- doc = site.documents.find do |d|
168
- d['id'] == post[:id]
169
- end
170
- else
171
- # Crear un post nuevo y agregarlo a la colección
172
- collection.docs << doc = Jekyll::Document.new(path, site: site, collection: collection)
173
- doc.data['uuid'] = SecureRandom.uuid
174
- end
175
-
176
- thumbnail = post.dig(:data, '_thumbnail_id')&.to_i
177
-
178
- doc.data['layout'] = type
179
- doc.data['title'] = post[:title]
180
- doc.data['draft'] = post[:status] != 'publish'
181
- doc.data['id'] = post[:id]
182
- doc.data['date'] = Jekyll::Utils.parse_date(post[:date])
183
- doc.data['tags'] = post[:meta].select { |k| k[:type] == 'post_tag' }.map { |k| k[:name] }
184
- doc.data['categories'] = post[:meta].select { |k| k[:type] == 'category' }.map { |k| k[:name] }
185
- doc.data['image'] = attachments[thumbnail] if thumbnail
186
-
187
- doc.content = ReverseMarkdown.convert(sanitizer.sanitize(post[:content]))
188
- doc.save
189
- end
190
- end
191
- end
192
- end