RubyGems - sutty-migration - Versions diffs - 0.2.0 → 0.3.1 - Mend

sutty-migration 0.2.0 → 0.3.1

Files changed (10) hide show

checksums.yaml +4 -4
data/README.md +50 -0
data/lib/sutty_migration/data.rb +38 -30
data/lib/sutty_migration/jekyll/document_creator.rb +66 -22
data/lib/sutty_migration/wordpress.rb +74 -22
data/lib/sutty_migration/wordpress_xml/attachment.rb +69 -0
data/lib/sutty_migration/wordpress_xml/post.rb +171 -0
data/lib/sutty_migration/wordpress_xml.rb +154 -0
metadata +47 -3
data/lib/wordpress.rb +0 -192

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 046cf945de1c0736e329224151a4b331c87e44cab6e6c2bc1e69f22b7639fe68
-  data.tar.gz: 58effbee202ab51c7ff1ed4e8e98498cd5a65f618c6ae6fc4f924fe6aed2e0e1
+  metadata.gz: e94ad861c92188564cbac8821283cd8d6229ae2c85f44ebc24dbc235228de794
+  data.tar.gz: e2ae77d06641891aeb6572693536fcfb2e436e2691abf8994cdc540e92104780
 SHA512:
-  metadata.gz: a21cb549bddd9bc55218c0633932300811e547d2d4cfde2525fbcdcaf8a2ff3bb89f0542813b46fdb9b90c9d739166ce37e303fb4de76b918b52bee9402fcb6d
-  data.tar.gz: 544f18359b4e9996c07f6828643bb8d4856d55457b4a6134b2ef2cedf01d4471d294effc6024598ef66f9b1d2258345f44ac370e1678166a2b9d19b5c4a55e74
+  metadata.gz: 9b3382a28169ae769d3993ef5fa7c01421fb8eb951b27e383ec084e22951242a110e11704f8f38ec198740d1673b096044d7f2bf00e63daa72294feffab04cc9
+  data.tar.gz: 661841af686da59fa2ebc08be9c0d992e5fb3965e25a6c4751e0ff192ddd65dde2cc426c0fcd610ae1a19fd494b0e1c5d245459848ab6c768052480a6a96af0f

data/README.md CHANGED Viewed

@@ -23,6 +23,7 @@ Add the plugin to your `_config.yml`:
 ```yaml
 plugins:
 - sutty-migration
+array_separator: ','
 ```
 Compile a CSV file with the following required fields:
@@ -114,6 +115,55 @@ Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
 end
 ```
+### WordPress XML
+If you have the XML dump from a WordPress site, you can migrate content
+by writing a migration plugin.
+```ruby
+# frozen_string_literal: true
+require 'sutty_migration/jekyll/document_creator'
+require 'sutty_migration/wordpress_xml'
+require 'jekyll-write-and-commit-changes'
+require 'securerandom'
+# Run after reading the site
+Jekyll::Hooks.register :site, :post_read do |site|
+  # Put the XML dump at _files/wordpress.xml
+  xml = SuttyMigration::WordpressXml.new site: site, file: '_files/wordpress.xml'
+  # Download all files
+  xml.attachments.values.map(&:download)
+  # Migrate posts.  You can move metadata around and recover
+  # relationships or any info your theme requires.
+  xml.posts.values.each do |post|
+    # Update documents already migrated.  `locale` must hold the label
+    # of the target collection.
+    doc = Jekyll::Document.find_or_create(site: site, collection: locale, title: post.title, slug: post.slug, date: post.date)
+    # Don't change the UUIDv4
+    doc.data['uuid'] ||= SecureRandom.uuid
+    doc.data['draft'] = post.draft?
+    doc.data['layout'] = 'post'
+    doc.data['last_modified_at'] = post.last_modified_at
+    doc.data['categories'] = post.categories.map { |c| c[:title] }
+    doc.data['tags'] = post.tags.map { |t| t[:title] }
+    doc.data['author'] = post.author[:email]
+    doc.data['description'] = post.description
+    doc.content = post.content
+    doc.save
+  rescue => e
+    Jekyll.logger.warn "Couldn't migrate #{post.title}"
+  end
+  exit # Stop here
+end
+```
 ## Contributing
 Bug reports and pull requests are welcome on 0xacab.org at

data/lib/sutty_migration/data.rb CHANGED Viewed

@@ -13,6 +13,8 @@ require_relative 'jekyll/document_creator'
 Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
   documents = site.documents
+  array_separator = site.config.fetch('array_separator', ',')
   site.data['layouts']&.each do |name, layout|
     site.data.dig('migration', name)&.each do |row|
       row['date'] = Jekyll::Utils.parse_date(row['date']) unless row['date'].blank?
@@ -24,41 +26,47 @@ Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
         end
       end
-      document ||= Jekyll::Document.create(site: site, collection: 'posts', **row.slice(*%w[date slug title]).transform_keys(&:to_sym))
+      document ||= begin
+                     data = row.slice(*%w[date slug title]).transform_keys(&:to_sym)
+                     Jekyll::Document.find_or_create(site: site, collection: 'posts', **data)
+                   end
+      next unless document
       row.each do |attribute, value|
-        next unless value.blank?
+        next if value.nil? || value.blank?
+        value.strip! if value.is_a? String
         row[attribute] =
           case layout.dig(attribute, 'type')
-            when 'string' then value
-            when 'text' then value
-            when 'tel' then value
-            # TODO: validate
-            when 'color' then value
-            when 'date' then Jekyll::Utils.parse_date(value)
-            # TODO: validate
-            when 'email' then value
-            # TODO: validate
-            when 'url' then value
-            when 'content' then value
-            when 'markdown_content' then value
-            when 'markdown' then value
-            when 'number' then value.to_i
-            when 'order' then value.to_i
-            when 'boolean' then !value.strip.empty?
-            when 'array' then value.split(',').map(&:strip)
-            # TODO: process values from the default array
-            when 'predefined_array' then value.split(',').map(&:strip)
-            when 'image' then { 'path' => value, 'description' => '' }
-            when 'file' then { 'path' => value, 'description' => '' }
-            when 'geo' then %w[lat lng].zip(value.split(',', 2).map(&:to_f)).to_h
-            when 'belongs_to' then value
-            when 'has_many' then value.split(',').map(&:strip)
-            when 'has_and_belongs_to_many' then value.split(',').map(&:strip)
-            when 'related_posts' then value.split(',').map(&:strip)
-            when 'locales' then value.split(',').map(&:strip)
-            else value
+          when 'string' then value.tr("\n", ' ').squeeze(' ')
+          when 'text' then value.gsub("\n", "\n\n")
+          when 'tel' then value.tr("\n", ' ').squeeze(' ')
+          # TODO: validate
+          when 'color' then value.tr("\n", ' ').squeeze(' ')
+          when 'date' then Jekyll::Utils.parse_date(value)
+          # TODO: validate
+          when 'email' then value.tr("\n", ' ').squeeze(' ')
+          # TODO: validate
+          when 'url' then value.tr("\n", ' ').squeeze(' ')
+          when 'content' then value.gsub("\n", "\n\n")
+          when 'markdown_content' then value.gsub("\n", "\n\n")
+          when 'markdown' then value.gsub("\n", "\n\n")
+          when 'number' then value.to_i
+          when 'order' then value.to_i
+          when 'boolean' then !value.strip.empty?
+          when 'array' then value.split(array_separator).map(&:strip)
+          # TODO: process values from the default array
+          when 'predefined_array' then value.split(array_separator).map(&:strip)
+          when 'image' then { 'path' => value, 'description' => '' }
+          when 'file' then { 'path' => value, 'description' => '' }
+          when 'geo' then %w[lat lng].zip(value.split(array_separator, 2).map(&:to_f)).to_h
+          when 'belongs_to' then value
+          when 'has_many' then value.split(array_separator).map(&:strip)
+          when 'has_and_belongs_to_many' then value.split(array_separator).map(&:strip)
+          when 'related_posts' then value.split(array_separator).map(&:strip)
+          when 'locales' then value.split(array_separator).map(&:strip)
+          else value
           end
       end

data/lib/sutty_migration/jekyll/document_creator.rb CHANGED Viewed

@@ -7,31 +7,75 @@ module SuttyMigration
   module Jekyll
     module DocumentCreator
       class DocumentExists < ArgumentError; end
       def self.included(base)
         base.class_eval do
+          class << self
+            # Creates a new document in a collection or fails if it already
+            # exists.
+            #
+            # @param :site [Jekyll::Site] Jekyll site
+            # @param :date [Time] Post date
+            # @param :title [String] Post title
+            # @param :slug [String] Post slug, slugified title if empty
+            # @param :collection [Jekyll::Collection,String] Collection label or collection
+            # @return [Jekyll::Document] A new document
+            def create(site:, date:, title:, collection:, slug: nil)
+              collection = site.collections[collection] if collection.is_a? String
+              slug = ::Jekyll::Utils.slugify(title, mode: 'latin') if slug.blank?
+              basename = "#{date.strftime('%F')}-#{slug}.markdown"
+              path = File.join(collection.directory, basename)
-      # Creates a new document in a collection or fails if it already
-      # exists.
-      #
-      # @param :site [Jekyll::Site] Jekyll site
-      # @param :date [Time] Post date
-      # @param :title [String] Post title
-      # @param :slug [String] Post slug, slugified title if empty
-      # @param :collection [Jekyll::Collection,String] Collection label or collection
-      # @return [Jekyll::Document] A new document
-      def self.create(site:, date:, title:, slug: nil, collection:)
-        collection = site.collections[collection] if collection.is_a? String
-        slug = ::Jekyll::Utils.slugify(title, mode: 'latin') if slug.blank?
-        basename = "#{date.strftime('%F')}-#{slug}.markdown"
-        path = File.join(collection.directory, basename)
-        raise DocumentExists, "#{path} already exists" if File.exist? path
-        ::Jekyll::Document.new(path, site: site, collection: collection).tap do |document|
-          collection.docs << document
-          document.data['title'] = title
-        end
-      end
+              raise DocumentExists, "#{path} already exists" if File.exist? path
+              ::Jekyll::Document.new(path, site: site, collection: collection).tap do |document|
+                collection.docs << document
+                document.data['title'] = title
+              end
+            end
+            # Finds a document by its relative path or creates it if it
+            # doesn't exist.  Helpful for idempotent migrations (create or
+            # update actions)
+            #
+            # @param :site [Jekyll::Site] Jekyll site
+            # @param :date [Time] Post date
+            # @param :title [String] Post title
+            # @param :slug [String] Post slug, slugified title if empty
+            # @param :collection [Jekyll::Collection,String] Collection label or collection
+            # @return [Jekyll::Document] The found document or a new one
+            def find_or_create(site:, date:, title:, collection:, slug: nil)
+              collection = site.collections[collection] if collection.is_a? String
+              slug = ::Jekyll::Utils.slugify(title, mode: 'latin') if slug.blank?
+              basename = "#{date.strftime('%F')}-#{slug}.markdown"
+              path = File.join(collection.relative_directory, basename)
+              return find(site: site, relative_path: path) if File.exist?(path)
+              create(site: site, date: date, title: title, slug: slug, collection: collection)
+            end
+            # Finds a document by its relative path
+            #
+            # @param :site [Jekyll::Site]
+            # @param :relative_path [String]
+            # @return [Jekyll::Document,Nil]
+            def find(site:, relative_path:)
+              indexed_documents_by_relative_path(site)[relative_path]
+            end
+            # Index documents by relative path for faster finding
+            #
+            # @param [Jekyll::Site]
+            # @return [Hash]
+            def indexed_documents_by_relative_path(site)
+              @indexed_documents_by_relative_path ||= site.documents.reduce({}) do |idx, doc|
+                idx.tap do |i|
+                  i[doc.relative_path] = doc
+                end
+              end
+            end
+          end
         end
       end
     end

data/lib/sutty_migration/wordpress.rb CHANGED Viewed

@@ -38,15 +38,21 @@ module SuttyMigration
     #
     # @return [Hash] { "ID" => SuttyMigration::Wordpress }
     def blogs
-      @blogs ||= wp["select blog_id as id, domain, path from #{prefix}blogs"].to_a.map do |blog|
+      @blogs ||= wp["select * from #{prefix}blogs"].to_a.map do |blog|
         url   = "https://#{blog[:domain]}#{blog[:path]}"
-        pfx   = "#{prefix}#{blog[:id]}_" if blog[:id] > 1
+        pfx   = "#{prefix}#{blog[:blog_id]}_" if blog[:blog_id] > 1
         pfx ||= prefix
-        [ blog[:id], self.class.new(site: site, url: url, prefix: pfx, database: database, limit: limit, multisite: self) ]
+        [blog[:blog_id],
+         blog.merge(db: self.class.new(site: site, url: url, prefix: pfx, database: database, limit: limit,
+                                       multisite: self))]
       end.to_h
     end
+    def options
+      @options ||= wp["select option_name, option_value from #{prefix}options"].to_a.map(&:values).to_h.transform_keys(&:to_sym)
+    end
     # Open the database.
     #
     # @return [Sequel::SQLite::Database]
@@ -90,11 +96,11 @@ module SuttyMigration
         end
         Faraday.get(url) do |req|
-          req.options.on_data = Proc.new do |chunk, downloaded_bytes|
+          req.options.on_data = proc do |chunk, downloaded_bytes|
             f.write chunk
             if progress
-              progress.progress = (downloaded_bytes > content_length) ? content_length : downloaded_bytes
+              progress.progress = downloaded_bytes > content_length ? content_length : downloaded_bytes
             end
           end
         end
@@ -126,10 +132,31 @@ module SuttyMigration
         p.map do |post|
           # Sequel parses dates on localtime
           post[:date] = ::Jekyll::Utils.parse_date(post[:date]) unless post[:date].blank?
-          post[:last_modified_at] = ::Jekyll::Utils.parse_date(post[:last_modified_at]) unless post[:last_modified_at].blank?
+          unless post[:last_modified_at].blank?
+            post[:last_modified_at] =
+              ::Jekyll::Utils.parse_date(post[:last_modified_at])
+          end
-          post[:front_matter] = JSON.parse(post[:front_matter]).transform_keys(&:to_sym) unless post[:front_matter].blank?
-          post[:terms] = JSON.parse(post[:terms]).transform_keys(&:to_sym) unless post[:terms].blank?
+          post[:front_matter] =
+            begin
+              unless post[:front_matter].blank?
+                JSON.parse(post[:front_matter]).transform_keys(&:to_sym).transform_values do |v|
+                  v.size == 1 ? v.first : v
+                end
+              end
+            rescue JSON::ParserError
+              {}
+            end
+          post[:terms] =
+            begin
+              unless post[:terms].blank?
+                JSON.parse(post[:terms]).transform_keys(&:to_sym).transform_values do |v|
+                  v.size == 1 ? v.first : v
+                end
+              end
+            rescue JSON::ParserError
+              {}
+            end
         end
       end
     end
@@ -163,7 +190,7 @@ module SuttyMigration
       <<~EOQ
         select
           u.*
-          #{", json_group_object(m.meta_key, m.meta_value) as meta" if with_meta}
+          #{', json_group_object(m.meta_key, m.meta_value) as meta' if with_meta}
         from #{pfx}users as u
         #{"left join #{pfx}usermeta as m on m.user_id = u.id" if with_meta}
         group by u.id
@@ -199,31 +226,56 @@ module SuttyMigration
           p.menu_order as menu_order,
           p.post_mime_type as mime_type,
           p.comment_count as comment_count
-          #{", json_group_object(f.meta_key, f.meta_value) as front_matter" if with_meta}
-          #{", t.terms as terms" if with_meta}
+          #{', f.front_matter as front_matter' if with_meta}
+          #{', t.terms as terms' if with_meta}
         from #{prefix}posts as p
-        left join #{prefix}postmeta as f on p.ID = f.post_id
-        #{"left join (#{terms_query(layout: layout)}) as t on t.id = p.ID" if with_meta}
+        #{"left join (#{meta_query}) as f on f.post_id = p.ID" if with_meta}
+        #{"left join (#{terms_query}) as t on t.post_id = p.ID" if with_meta}
         #{"where p.post_type = '#{layout}'" if layout}
         group by p.ID
       EOQ
     end
+    # Recover the post meta as a JSON object with multiple values
+    # converted to arrays
+    #
+    # @return [String]
+    def meta_query
+      <<~EOQ
+        select
+          post_id,
+          json_group_object(meta_key, json(meta_values)) as front_matter
+        from (
+          select
+            post_id,
+            meta_key,
+            json_group_array(meta_value) as meta_values
+          from #{prefix}postmeta
+          group by post_id, meta_key
+        )
+        group by post_id
+      EOQ
+    end
     # Term taxonomy query
     #
     # @param :layout [String] Layout name
     # @return [String]
-    def terms_query(layout: nil)
+    def terms_query
       <<~EOQ
         select
-          p.ID as id,
-          json_group_object(tt.taxonomy, t.name) as terms
-        from #{prefix}posts as p
-        left join #{prefix}term_relationships as r on r.object_id = p.ID
-        left join #{prefix}term_taxonomy as tt on tt.term_taxonomy_id = r.term_taxonomy_id
-        left join #{prefix}terms as t on t.term_id = tt.term_id
-        #{"where p.post_type = '#{layout}'" if layout}
-        group by p.ID
+          post_id,
+          json_group_object(taxonomy, json(terms)) as terms
+        from (
+          select
+            r.object_id as post_id,
+            tt.taxonomy,
+            json_group_array(t.name) as terms
+          from #{prefix}term_relationships as r
+          left join #{prefix}term_taxonomy as tt on tt.term_taxonomy_id = r.term_taxonomy_id
+          left join #{prefix}terms as t on t.term_id = tt.term_id
+          group by r.object_id)
+        group by post_id
       EOQ
     end
   end

data/lib/sutty_migration/wordpress_xml/attachment.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+require_relative 'post'
+require 'php-serialize'
+require 'faraday'
+require 'progressbar'
+module SuttyMigration
+  class WordpressXml
+    # Represents an attachment or uploaded file.
+    class Attachment < Post
+      # File URL
+      #
+      # @return [String]
+      def attachment_url
+        @attachment_url ||= attribute_value 'attachment_url'
+      end
+      # File destination
+      #
+      # @return [String]
+      def dest
+        @dest ||= URI(attachment_url).path.sub(%r{\A/}, '')
+      end
+      # Metadata, with file information as a Hash
+      #
+      # @return [Hash]
+      def meta
+        super.tap do |m|
+          m['_wp_attachment_metadata'] = PHP.unserialize m['_wp_attachment_metadata']
+        end
+      end
+      # Download the file if it doesn't exist.  Optionally show a
+      # progress bar.
+      #
+      # @param :progress [Boolean]
+      # @return [Boolean]
+      def download(progress: true)
+        return true if File.exist? dest
+        ::Jekyll.logger.info "Downloading #{dest}"
+        FileUtils.mkdir_p File.dirname(dest)
+        File.open(dest, 'w') do |f|
+          if progress
+            head = Faraday.head(attachment_url)
+            content_length = head.headers['content-length'].to_i
+            progress = ProgressBar.create(title: File.basename(dest), total: content_length, output: $stderr)
+          end
+          Faraday.get(attachment_url) do |req|
+            req.options.on_data = proc do |chunk, downloaded_bytes|
+              f.write chunk
+              if progress
+                progress.progress = downloaded_bytes > content_length ? content_length : downloaded_bytes
+              end
+            end
+          end
+        end
+        File.exist? dest
+      end
+    end
+  end
+end

data/lib/sutty_migration/wordpress_xml/post.rb ADDED Viewed

@@ -0,0 +1,171 @@
+# frozen_string_literal: true
+require 'wordpress_formatting/wpautop'
+require 'jekyll/utils'
+module SuttyMigration
+  class WordpressXml
+    # Represents a WordPress post
+    class Post
+      attr_reader :wordpress, :item
+      # @param :wordpress [SuttyMigration::WordpressXml]
+      # @param :item [Nokogiri::XML::Element]
+      def initialize(wordpress:, item:)
+        @wordpress = wordpress
+        @item = item
+      end
+      def inspect
+        "#<SuttyMigration::WordpressXml::Post title=\"#{title}\">"
+      end
+      # Post ID
+      #
+      # @return [Integer]
+      def id
+        @id ||= attribute_value('post_id').to_i
+      end
+      # Permalink.  Link to the post with the site URL removed.
+      #
+      # @return [String]
+      def permalink
+        @permalink ||= attribute_value('link').sub(wordpress.url, '')
+      end
+      # Title
+      #
+      # @return [String]
+      def title
+        @title ||= attribute_value('title')
+      end
+      # Description
+      #
+      # @return [String]
+      def description
+        @description ||= attribute_value('description')
+      end
+      # Slug ("post name")
+      #
+      # @return [String]
+      def slug
+        @slug ||= attribute_value('post_name')
+      end
+      # Publication date.
+      #
+      # WordPress can store this date in three different fields and
+      # sometimes they come empty or invalid.
+      #
+      # @return [Time,nil] nil when none of the fields can be parsed
+      def date
+        @date ||= %w[pubDate post_date_gmt post_date].map do |date_attr|
+          ::Jekyll::Utils.parse_date attribute_value(date_attr)
+        rescue StandardError
+        end.compact.first
+      end
+      # Modification date.
+      #
+      # @return [Time]
+      def last_modified_at
+        @last_modified_at ||= ::Jekyll::Utils.parse_date attribute_value('post_modified_gmt')
+      end
+      # Content as HTML, with site URL removed.
+      #
+      # @return [String]
+      def content
+        @content ||= WordpressFormatting::Wpautop.wpautop(attribute_value('encoded')).gsub(
+          / (href|src)="#{wordpress.url}/, ' \\1="'
+        )
+      end
+      # Author attributes.
+      #
+      # @return [Hash]
+      def author
+        @author ||= wordpress.authors[attribute_value('creator')]
+      end
+      # Post password.  Use with jekyll-crypto.
+      #
+      # @return [String]
+      def password
+        @password ||= attribute_value 'post_password'
+      end
+      # Tags with attributes.
+      #
+      # @return [Array<Hash>]
+      def tags
+        @tags ||= item.css('category').select do |c|
+          c[:domain] == 'post_tag'
+        end.map do |c|
+          wordpress.tags[c[:nicename]]
+        end
+      end
+      # Categories with attributes.
+      #
+      # @return [Array<Hash>]
+      def categories
+        @categories ||= item.css('category').select do |c|
+          c[:domain] == 'category'
+        end.map do |c|
+          wordpress.categories[c[:nicename]]
+        end
+      end
+      # Metadata.  Plugins store useful information here.  Duplicated
+      # keys are returned as an Array of values.
+      #
+      # @return [Hash]
+      def meta
+        @meta ||= {}.tap do |meta|
+          item.css('postmeta').each do |m|
+            key = m.css('meta_key').text
+            value = m.css('meta_value').text
+            case meta[key]
+            when nil then meta[key] = value
+            when String then meta[key] = [meta[key], value]
+            when Array then meta[key] << value
+            end
+          end
+        end
+      end
+      # Order, taken from the `is_sticky` field.  Higher are sorted on
+      # top by jekyll-order.
+      #
+      # @return [String]
+      def order
+        @order ||= attribute_value 'is_sticky'
+      end
+      # Publication status
+      #
+      # @return [Boolean]
+      def published?
+        @published ||= attribute_value('status') == 'publish'
+      end
+      # Draft status
+      #
+      # @return [Boolean]
+      def draft?
+        @draft ||= attribute_value('status') == 'draft'
+      end
+      # Get the text of the first element inside the item that matches
+      # the key
+      #
+      # @param key [String] CSS selector for the element
+      # @return [String]
+      def attribute_value(key)
+        item.at_css(key).text
+      end
+    end
+  end
+end

data/lib/sutty_migration/wordpress_xml.rb ADDED Viewed

@@ -0,0 +1,154 @@
+# frozen_string_literal: true
+require 'nokogiri'
+require_relative 'wordpress_xml/post'
+require_relative 'wordpress_xml/attachment'
+module SuttyMigration
+  # Understands the XML dump generated by Wordpress and creates
+  # Jekyll::Documents
+  class WordpressXml
+    attr_reader :site, :file, :xml
+    # @param :site [Jekyll::Site] Jekyll site
+    # @param :file [String] File path
+    def initialize(site:, file:)
+      @site = site
+      @file = file
+      @xml  = Nokogiri::XML File.read(file)
+      # Make things easier by removing namespaces.
+      xml.remove_namespaces!
+    end
+    def inspect
+      '#<SuttyMigration::WordpressXml>'
+    end
+    # Site URL
+    #
+    # @return [String]
+    def url
+      @url ||= attribute_value(xml, 'channel > link')
+    end
+    # Site title
+    #
+    # @return [String]
+    def title
+      @title ||= attribute_value(xml, 'channel > title')
+    end
+    # Description
+    #
+    # @return [String]
+    def description
+      @description ||= attribute_value(xml, 'channel > description')
+    end
+    # Language
+    #
+    # TODO: Migrate multilanguage sites.
+    #
+    # @return [String]
+    def language
+      @language ||= attribute_value(xml, 'channel > language')
+    end
+    # Authors with attributes, indexed by author email.
+    #
+    # @return [Hash]
+    def authors
+      @authors ||= xml.css('channel > author').map do |author|
+        {
+          attribute_value(author, 'author_email') => {
+            id: attribute_value(author, 'author_id').to_i,
+            display_name: attribute_value(author, 'author_display_name'),
+            first_name: attribute_value(author, 'author_first_name'),
+            last_name: attribute_value(author, 'author_last_name'),
+            email: attribute_value(author, 'author_email')
+          }
+        }
+      end.reduce(&:merge)
+    end
+    # Categories with attributes, indexed by slug ("nicename")
+    #
+    # @return [Hash]
+    def categories
+      @categories ||= xml.css('channel > category').map do |category|
+        {
+          attribute_value(category, 'category_nicename') => {
+            id: attribute_value(category, 'term_id').to_i,
+            title: attribute_value(category, 'cat_name'),
+            parent: attribute_value(category, 'category_parent'),
+            slug: attribute_value(category, 'category_nicename')
+          }
+        }
+      end.reduce(&:merge)
+    end
+    # Tags with attributes, indexed by slug
+    #
+    # @return [Hash]
+    def tags
+      @tags ||= xml.css('channel > tag').map do |tag|
+        {
+          attribute_value(tag, 'tag_slug') => {
+            id: attribute_value(tag, 'term_id').to_i,
+            title: attribute_value(tag, 'tag_name'),
+            slug: attribute_value(tag, 'tag_slug')
+          }
+        }
+      end.reduce(&:merge)
+    end
+    # Posts, indexed by ID
+    #
+    # @return [Hash]
+    def posts
+      @posts ||= items_find_by('post_type', 'post').map do |post|
+        { attribute_value(post, 'post_id').to_i => Post.new(wordpress: self, item: post) }
+      end.reduce(&:merge)
+    end
+    # Pages, indexed by ID
+    #
+    # @return [Hash]
+    def pages
+      @pages ||= items_find_by('post_type', 'page').map do |page|
+        { attribute_value(page, 'post_id').to_i => Post.new(wordpress: self, item: page) }
+      end.reduce(&:merge)
+    end
+    # Attachments, indexed by ID
+    #
+    # @return [Hash]
+    def attachments
+      @attachments ||= items_find_by('post_type', 'attachment').map do |attachment|
+        { attribute_value(attachment, 'post_id').to_i => Attachment.new(wordpress: self, item: attachment) }
+      end.reduce(&:merge)
+    end
+    # Find items containing an element with the given text
+    #
+    # @param attribute [String] Name of the element inside the item
+    # @param value [String] Expected text of that element
+    # @return [Array<Nokogiri::XML::Element>]
+    def items_find_by(attribute, value)
+      xml.css('channel > item').select do |item|
+        attribute_value(item, attribute) == value
+      end
+    end
+    # Get the text of the first element matching a CSS selector
+    #
+    # @param element [Nokogiri::XML::Node] Node to search in
+    # @param attribute [String] CSS selector
+    # @return [String]
+    def attribute_value(element, attribute)
+      element.at_css(attribute).text
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: sutty-migration
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.1
 platform: ruby
 authors:
 - f
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-06-09 00:00:00.000000000 Z
+date: 2021-08-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: jekyll
@@ -108,6 +108,48 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '5.45'
+- !ruby/object:Gem::Dependency
+  name: wordpress-formatting
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.12.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.12.0
+- !ruby/object:Gem::Dependency
+  name: php-serialize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.3.0
 - !ruby/object:Gem::Dependency
   name: pry
   requirement: !ruby/object:Gem::Requirement
@@ -138,7 +180,9 @@ files:
 - lib/sutty_migration/data.rb
 - lib/sutty_migration/jekyll/document_creator.rb
 - lib/sutty_migration/wordpress.rb
-- lib/wordpress.rb
+- lib/sutty_migration/wordpress_xml.rb
+- lib/sutty_migration/wordpress_xml/attachment.rb
+- lib/sutty_migration/wordpress_xml/post.rb
 homepage: https://0xacab.org/sutty/jekyll/sutty-migration
 licenses:
 - GPL-3.0

data/lib/wordpress.rb DELETED Viewed

@@ -1,192 +0,0 @@
-# frozen_string_literal: true
-# Generar UUIDs
-require 'securerandom'
-# Traer resultados de la base de datos
-require 'sequel'
-require 'sqlite3'
-require 'json'
-# Descargar archivos
-require 'faraday'
-require 'progressbar'
-class Wordpress
-  attr_reader :site, :prefix, :limit, :url
-  def initialize(site:, url:, prefix: 'wp_', limit: 10)
-    @site = site
-    @prefix = prefix.freeze
-    @limit = limit.freeze
-    @url = url.freeze
-    # Conectarse a la base de datos
-    @wp = Sequel.sqlite(File.join(site.source, '_data', 'wordpress', 'post.sqlite3'))
-    # Las funciones de JSON usan mucha CPU, vamos a traer de a pocos
-    # registros.
-    @wp.extension :pagination
-  end
-  def download(file)
-    dest = "wp-content/uploads/#{file}"
-    full = File.join(site.source, dest)
-    return dest if File.exist? full
-    Jekyll.logger.info "Downloading #{dest}"
-    FileUtils.mkdir_p File.dirname(full)
-    File.open(full, 'w') do |f|
-      url = "#{url}/#{dest}"
-      head = Faraday.head(url)
-      content_length = head.headers['content-length']
-      progress_bar = ProgressBar.new
-      Faraday.get(url) do |req|
-        req.options.on_data = Proc.new do |chunk, downloaded_bytes|
-          f.write chunk
-        end
-      end
-    end
-    dest
-  end
-  # Obtiene todos los tipos de artículos disponibles
-  #
-  # @return [Array]
-  def layouts
-    @layouts ||= @wp["select distinct post_type from #{prefix}posts"].to_a.map(&:values).flatten
-  end
-  # Obtiene todos los posts opcionalmente filtrando por tipo de post.
-  # No es la forma oficial de Sequel pero no tenemos tiempo de
-  # aprenderla específicamente y además tenemos las opciones en formato
-  # JSON que no estarían soportadas.
-  #
-  # @param :layout [String] Layout name, one of #layouts
-  # @param :with_meta [Boolean]
-  # @return [Enumerator]
-  def posts(**options)
-    if options[:layout] && !layouts.include?(options[:layout])
-      raise ArgumentError, "#{layout} must be one of #{layouts.join(', ')}"
-    end
-    @posts ||= {}
-    @posts[options[:layout] || 'all'] ||= @wp[post_query(**options)].each_page(limit).to_a.map(&:to_a).flatten.tap do |p|
-      next unless options[:with_meta]
-      p.map do |post|
-        post[:front_matter] = JSON.parse(post[:front_matter]) unless post[:front_matter].nil?
-        post[:terms] = JSON.parse(post[:terms]) unless post[:terms].nil?
-      end
-    end
-  end
-  private
-  # Consulta para los posts, incluyendo metadatos en JSON.  Los
-  # metadatos vienen en dos partes porque tienen dos
-  #
-  # @return [String]
-  def post_query(layout: nil, with_meta: true)
-    @post_query ||= <<~EOQ
-      select
-        p.ID as id,
-        p.post_title as title,
-        p.post_name as slug,
-        p.post_type as layout,
-        p.strftime('%Y-%m-%d', post_date) as date,
-        p.post_status as status,
-        p.post_content as content
-        #{", json_group_object(f.meta_key, f.meta_value) as front_matter" if with_meta}
-        #{", t.meta as meta" if with_meta}
-      from #{prefix}posts as p
-      left join #{prefix}postmeta as f on p.ID = f.post_id
-      #{"left join (#{meta_query(layout: layout)}) as as t on t.id = p.ID" if with_meta}
-      #{"where p.post_type = :layout" if layout}
-      group by p.ID
-    EOQ
-  end
-  #
-  def meta_query(layout: nil)
-    @meta_query ||= <<~EOQ
-      select
-        p.ID as id,
-        json_group_object(tt.taxonomy, t.name) as meta
-      from #{prefix}posts as p
-      left join #{prefix}term_relationships as r on r.object_id = p.ID
-      left join #{prefix}term_taxonomy as tt on tt.term_taxonomy_id = r.term_taxonomy_id
-      left join #{prefix}terms as t on t.term_id = tt.term_id
-      #{"where p.post_type = :layout" if layout}
-      group by p.ID
-    EOQ
-  end
-end
-# Antes de generar el sitio vamos a leer todos los artículos desde la
-# base de datos y generarlos localmente.
-Jekyll::Hooks.register :site, :post_read do |site|
-  wp = Wordpress.new(site: site,
-                     url: site.config.dig('wordpress', 'url'),
-                     prefix: site.config.dig('wordpress', 'prefix'))
-  collection = site.collections['posts']
-  ascii_re = Regexp.new("\P{ASCII}").freeze
-  sanitizer = Rails::Html::SafeListSanitizer.new
-  # Traer todas las imágenes cargadas y descargarlas
-  attachments = wp.posts(layout: 'attachment').map do |page|
-    page.map do |attachment|
-      attachment[:data] = JSON.parse(attachment[:data]) unless attachment[:data].nil?
-      file = attachment.dig(:data, '_wp_attached_file')
-      next unless file
-      dest = wp.download(file)
-      # Tener un mapa de IDs y archivos destino
-      [ attachment[:id], dest ]
-    end
-  end.compact.flatten(1).to_h
-  %w[post page].each do |type|
-    wp.posts(layout: type).each do |page|
-      page.each do |post|
-        # Convertir los datos extra en un Hash
-        post[:data] = JSON.parse(post[:data]) unless post[:data].nil?
-        post[:slug] = Jekyll::Utils.slugify(post[:title], mode: 'latin') if post[:slug].empty?
-        post[:meta] = wp.meta id: post[:id]
-        path = File.join(site.source, '_posts', post.slice(:date, :slug).values.join('-') + '.markdown')
-        if File.exist? path
-          Jekyll.logger.info "#{path} ya fue migrado, actualizando"
-          doc = site.documents.find do |d|
-            d['id'] == post[:id]
-          end
-        else
-          # Crear un post nuevo y agregarlo a la colección
-          collection.docs << doc = Jekyll::Document.new(path, site: site, collection: collection)
-          doc.data['uuid'] = SecureRandom.uuid
-        end
-        thumbnail = post.dig(:data, '_thumbnail_id')&.to_i
-        doc.data['layout'] = type
-        doc.data['title'] = post[:title]
-        doc.data['draft'] = post[:status] != 'publish'
-        doc.data['id'] = post[:id]
-        doc.data['date'] = Jekyll::Utils.parse_date(post[:date])
-        doc.data['tags'] = post[:meta].select { |k| k[:type] == 'post_tag' }.map { |k| k[:name] }
-        doc.data['categories'] = post[:meta].select { |k| k[:type] == 'category' }.map { |k| k[:name] }
-        doc.data['image'] = attachments[thumbnail] if thumbnail
-        doc.content = ReverseMarkdown.convert(sanitizer.sanitize(post[:content]))
-        doc.save
-      end
-    end
-  end
-end