RubyGems - sutty-migration - Versions diffs - 0.1.2 → 0.3.0 - Mend

sutty-migration 0.1.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/README.md +98 -0
data/lib/sutty-migration.rb +1 -69
data/lib/sutty_migration/core_extensions.rb +26 -0
data/lib/sutty_migration/data.rb +80 -0
data/lib/sutty_migration/jekyll/document_creator.rb +85 -0
data/lib/sutty_migration/wordpress.rb +282 -0
data/lib/sutty_migration/wordpress_xml.rb +154 -0
data/lib/sutty_migration/wordpress_xml/attachment.rb +69 -0
data/lib/sutty_migration/wordpress_xml/post.rb +171 -0
metadata +121 -3
data/lib/wordpress.rb +0 -174

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ffe46cc7d270c7f30d4a505704cb244e507df588bb380ad2c101c3a01675844b
-  data.tar.gz: d63a9dd7fde09627c61f02f7b714e151f5122d5133e26d8c4d3be7917d35501e
+  metadata.gz: 5486653e0e1eb13f5c4c4f85235c875c782fee7be37a1bee9e4cdd84d5879d0a
+  data.tar.gz: '096ab9a992ad5b4cf36bb765a4eb99bd4ac9f2fc35ec25737906eb7c3abc8fdf'
 SHA512:
-  metadata.gz: 9c9278d28ab6d4b862c5cc615941102c5ef8716c30b674093dd31f5a6dce1c337ff6729fab69e6018263a113b3b82f8faadc9b8aad2a9bc38cfd472d082d711c
-  data.tar.gz: a016f1a5ff26e8c8c5c1c95d0393f6ebe497919b0301282d412440779ec44fbd0c06e7b7ba95a8d30169d3c936dd4eaf5d3f8e59e8fdd6c563504b6027125a02
+  metadata.gz: e94245fd5af90a7411b842e13c44a5f85bbbe2544449de98eaa9c52dbb70f095938754bb65dea382f5275df26b6753c37c5cea24c564ff8f98ad6a0f29406e0e
+  data.tar.gz: f415da3e9c4ebee1ec8a6101676ae17a319d05ee248c8360d834d7f321190e791eb8274eb6fa0c5fcc74be5c1d2e1b77195894cb9179c3e33626bd090636327b

data/README.md CHANGED Viewed

@@ -23,6 +23,7 @@ Add the plugin to your `_config.yml`:
 ```yaml
 plugins:
 - sutty-migration
+array_separator: ','
 ```
 Compile a CSV file with the following required fields:
@@ -66,6 +67,103 @@ To start migration just build your site:
 bundle exec jekyll build
 ```
+**Tip:** Files can also be JSON, TSV and YAML, since they're all
+supported by Jekyll.
+### Wordpress
+Instead of requiring you to install and configure MariaDB/MySQL, you can
+convert the database into SQLite3 like this:
+```bash
+git clone https://0xacab.org/sutty/mysql2sqlite.git
+cd mysql2sqlite
+./mysql2sqlite /path/to/database/dump.sql |
+  sed -re "s/, 0x([0-9a-f]+),/, X'\1',/i" |
+  sqlite3 wordpress.sqlite3
+```
+It will probably show some errors.
+Note the `sed` command is required to convert hexadecimal values into
+SQLite syntax, since `mysql2sqlite` doesn't support this yet.
+Wordpress websites can include lots of posts and metadata, depending on
+the amount of plugins installed.  We don't have an official way of
+dumping everything into Jekyll, because you will probably want to move
+things around.  You can write a plugin like this:
+```ruby
+# _plugins/wordpress.rb
+# frozen_string_literal: true
+require 'sutty_migration/wordpress'
+require 'sutty_migration/jekyll/document_creator'
+require 'jekyll-write-and-commit-changes'
+Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
+  wp = SuttyMigration::Wordpress.new(site: site, database: 'wordpress.sqlite3', prefix: 'wp_', url: 'https://wordpre.ss')
+  # Download all files
+  wp.download_all
+  wp.posts(layout: 'post').each do |post|
+    doc = Jekyll::Document.create(site: site, title: post[:title], date: post[:date], collection: 'posts')
+    doc.content = post[:content]
+    doc.save
+  end
+end
+```
+### WordPress XML
+If you have the XML dump from a WordPress site, you can migrate content
+by writing a migration plugin.
+```ruby
+# frozen_string_literal: true
+require 'sutty_migration/jekyll/document_creator'
+require 'sutty_migration/wordpress_xml'
+require 'jekyll-write-and-commit-changes'
+require 'securerandom'
+# Run after reading the site
+Jekyll::Hooks.register :site, :post_read do |site|
+  # Put the XML dump at _files/wordpress.xml
+  xml = SuttyMigration::WordpressXml.new site: site, file: '_files/wordpress.xml'
+  # Download all files
+  xml.attachments.values.map(&:download)
+  # Migrate posts.  You can move metadata around and recover
+  # relationships or any info your theme requires.
+  xml.posts.values.each do |post|
+    # Update documents already migrated.
+    doc = Jekyll::Document.find_or_create(site: site, collection: 'posts', title: post.title, slug: post.slug, date: post.date)
+    # Don't change the UUIDv4
+    doc.data['uuid'] ||= SecureRandom.uuid
+    doc.data['draft'] = post.draft?
+    doc.data['layout'] = 'post'
+    doc.data['last_modified_at'] = post.last_modified_at
+    doc.data['categories'] = post.categories.map { |c| c[:title] }
+    doc.data['tags'] = post.tags.map { |t| t[:title] }
+    doc.data['author'] = post.author[:email]
+    doc.data['description'] = post.description
+    doc.content = post.content
+    doc.save
+  rescue => e
+    Jekyll.logger.warn "Couldn't migrate #{post.title}"
+  end
+  exit # Stop here
+end
+```
 ## Contributing
 Bug reports and pull requests are welcome on 0xacab.org at

data/lib/sutty-migration.rb CHANGED Viewed

@@ -1,71 +1,3 @@
 # frozen_string_literal: true
-require 'securerandom'
-require 'fast_blank'
-require 'jekyll-write-and-commit-changes'
-Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
-  documents = site.documents
-  site.data['layouts']&.each do |name, layout|
-    site.data.dig('migration', name)&.each do |row|
-      row['date'] = Jekyll::Utils.parse_date(row['date']) unless row['date'].blank?
-      if row['id']
-        document = documents.find do |doc|
-          doc.data['id'] == row['id']
-        end
-      end
-      document ||=
-        begin
-          base = "#{row['date'] || Date.today.to_s}-#{Jekyll::Utils.slugify(row['title'], mode: 'latin')}.markdown"
-          path = File.join(site.source, '_posts', base)
-          raise ArgumentError, "Row #{row['id']} duplicates file #{base}" if File.exist? path
-          doc = Jekyll::Document.new(path, site: site, collection: site.collections['posts'])
-          site.collections['posts'] << doc
-          doc
-        end
-      row.each do |attribute, value|
-        row[attribute] =
-          case layout.dig(attribute, 'type')
-            when 'string' then value
-            when 'text' then value
-            when 'tel' then value
-            when 'color' then value # TODO: validar
-            when 'date' then Jekyll::Utils.parse_date(value)
-            when 'email' then value # TODO: validar
-            when 'url' then value # TODO: validar
-            when 'content' then value
-            when 'markdown_content' then value
-            when 'markdown' then value
-            when 'number' then value.to_i
-            when 'order' then value.to_i
-            when 'boolean' then !value.strip.empty?
-            when 'array' then value.split(',').map(&:strip)
-            # TODO: procesar los valores en base a los valores predefinidos
-            when 'predefined_array' then value.split(',').map(&:strip)
-            when 'image' then { 'path' => value, 'description' => '' }
-            when 'file' then { 'path' => value, 'description' => '' }
-            when 'geo' then %w[lat lng].zip(value.split(',', 2).map(&:to_f)).to_h
-            when 'belongs_to' then value
-            when 'has_many' then value.split(',').map(&:strip)
-            when 'has_and_belongs_to_many' then value.split(',').map(&:strip)
-            when 'related_posts' then value.split(',').map(&:strip)
-            when 'locales' then value.split(',').map(&:strip)
-            else value
-          end
-      end
-      document.data['uuid'] ||= SecureRandom.uuid
-      document.content = row.delete('content')
-      document.data.merge! row
-      document.save
-    end
-  end
-end
+require_relative 'sutty_migration/data'

data/lib/sutty_migration/core_extensions.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+# Extend String so it can be checked for blankness
+require 'fast_blank'
+# nil always counts as blank and is never present
+class NilClass
+  def blank?
+    true
+  end
+  def present?
+    false
+  end
+end
+# A Time value never counts as blank and is always present
+class Time
+  def blank?
+    false
+  end
+  def present?
+    true
+  end
+end

data/lib/sutty_migration/data.rb ADDED Viewed

@@ -0,0 +1,80 @@
+# frozen_string_literal: true
+require 'securerandom'
+require_relative 'core_extensions'
+require_relative 'jekyll/document_creator'
+# Registers a plugin for converting CSV files into posts following
+# Sutty's layout definition.
+#
+# If jekyll-write-and-commit-changes is enabled, documents will be saved
+# on disk and commited is the build command is run with
+# JEKYLL_ENV=production
+Jekyll::Hooks.register :site, :post_read, priority: :low do |site|
+  documents = site.documents
+  array_separator = site.config.fetch('array_separator', ',')
+  site.data['layouts']&.each do |name, layout|
+    site.data.dig('migration', name)&.each do |row|
+      row['date'] = Jekyll::Utils.parse_date(row['date']) unless row['date'].blank?
+      row['date'] ||= Time.now
+      unless row['id'].blank?
+        document = documents.find do |doc|
+          doc.data['id'] == row['id']
+        end
+      end
+      document ||= Jekyll::Document.create(site: site, collection: 'posts',
+                                           **row.slice(*%w[date slug title]).transform_keys(&:to_sym))
+      row.each do |attribute, value|
+        next unless value.blank?
+        row[attribute] =
+          case layout.dig(attribute, 'type')
+          when 'string' then value
+          when 'text' then value
+          when 'tel' then value
+          # TODO: validate
+          when 'color' then value
+          when 'date' then Jekyll::Utils.parse_date(value)
+          # TODO: validate
+          when 'email' then value
+          # TODO: validate
+          when 'url' then value
+          when 'content' then value
+          when 'markdown_content' then value
+          when 'markdown' then value
+          when 'number' then value.to_i
+          when 'order' then value.to_i
+          when 'boolean' then !value.strip.empty?
+          when 'array' then value.split(array_separator).map(&:strip)
+          # TODO: process values from the default array
+          when 'predefined_array' then value.split(array_separator).map(&:strip)
+          when 'image' then { 'path' => value, 'description' => '' }
+          when 'file' then { 'path' => value, 'description' => '' }
+          when 'geo' then %w[lat lng].zip(value.split(array_separator, 2).map(&:to_f)).to_h
+          when 'belongs_to' then value
+          when 'has_many' then value.split(array_separator).map(&:strip)
+          when 'has_and_belongs_to_many' then value.split(array_separator).map(&:strip)
+          when 'related_posts' then value.split(array_separator).map(&:strip)
+          when 'locales' then value.split(array_separator).map(&:strip)
+          else value
+          end
+      end
+      document.data['uuid'] ||= SecureRandom.uuid
+      document.content = row.delete('content')
+      document.data.merge! row
+      document.save if document.respond_to? :save
+    end
+  end
+  next unless site.respond_to?(:repository)
+  next unless ENV['JEKYLL_ENV'] == 'production'
+  site.repository.commit 'CSV Migration'
+end

data/lib/sutty_migration/jekyll/document_creator.rb ADDED Viewed

@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+require 'jekyll/utils'
+require_relative '../core_extensions'
+module SuttyMigration
+  module Jekyll
+    module DocumentCreator
+      class DocumentExists < ArgumentError; end
+      def self.included(base)
+        base.class_eval do
+          class << self
+            # Creates a new document in a collection or fails if it already
+            # exists.
+            #
+            # @param :site [Jekyll::Site] Jekyll site
+            # @param :date [Time] Post date
+            # @param :title [String] Post title
+            # @param :slug [String] Post slug, slugified title if empty
+            # @param :collection [Jekyll::Collection,String] Collection label or collection
+            # @return [Jekyll::Document] A new document
+            def create(site:, date:, title:, collection:, slug: nil)
+              collection = site.collections[collection] if collection.is_a? String
+              slug = ::Jekyll::Utils.slugify(title, mode: 'latin') if slug.blank?
+              basename = "#{date.strftime('%F')}-#{slug}.markdown"
+              path = File.join(collection.directory, basename)
+              raise DocumentExists, "#{path} already exists" if File.exist? path
+              ::Jekyll::Document.new(path, site: site, collection: collection).tap do |document|
+                collection.docs << document
+                document.data['title'] = title
+              end
+            end
+            # Finds a document by its relative path or creates it if it
+            # doesn't exist.  Helpful for idempotent migrations (create or
+            # update actions)
+            #
+            # @param :site [Jekyll::Site] Jekyll site
+            # @param :date [Time] Post date
+            # @param :title [String] Post title
+            # @param :slug [String] Post slug, slugified title if empty
+            # @param :collection [Jekyll::Collection,String] Collection label or collection
+            # @return [Jekyll::Document] The found document or a new one
+            def find_or_create(site:, date:, title:, collection:, slug: nil)
+              collection = site.collections[collection] if collection.is_a? String
+              slug = ::Jekyll::Utils.slugify(title, mode: 'latin') if slug.blank?
+              basename = "#{date.strftime('%F')}-#{slug}.markdown"
+              path = File.join(collection.relative_directory, basename)
+              return find(site: site, relative_path: path) if File.exist?(path)
+              create(site: site, date: date, title: title, slug: slug, collection: collection)
+            end
+            # Finds a document by its relative path
+            #
+            # @param :site [Jekyll::Site]
+            # @param :relative_path [String]
+            # @return [Jekyll::Document,Nil]
+            def find(site:, relative_path:)
+              indexed_documents_by_relative_path(site)[relative_path]
+            end
+            # Index documents by relative path for faster finding
+            #
+            # @param [Jekyll::Site]
+            # @return [Hash]
+            def indexed_documents_by_relative_path(site)
+              @indexed_documents_by_relative_path ||= site.documents.reduce({}) do |idx, doc|
+                idx.tap do |i|
+                  i[doc.relative_path] = doc
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
+::Jekyll::Document.include SuttyMigration::Jekyll::DocumentCreator

data/lib/sutty_migration/wordpress.rb ADDED Viewed

@@ -0,0 +1,282 @@
+# frozen_string_literal: true
+require 'time'
+require 'securerandom'
+require 'sequel'
+require 'sqlite3'
+require 'json'
+require 'faraday'
+require 'progressbar'
+require 'jekyll/utils'
+module SuttyMigration
+  # Brings posts and attachments from a SQLite3 database.  You can
+  # convert a MySQL/MariaDB dump by using `mysql2sqlite`.
+  #
+  # It doesn't convert them into Jekyll posts but allows you to write a
+  # migration plugin where you can convert data by yourself.  We may add
+  # this feature in the future.
+  class Wordpress
+    attr_reader :site, :prefix, :limit, :url, :wp, :database, :multisite
+    # @param :site [Jekyll::Site] Jekyll site
+    # @param :url [String] Wordpress site URL (must be up for downloads)
+    # @param :database [String] Database path, by default `_data/wordpress.sqlite3`
+    # @param :prefix [String] WP table prefix
+    # @param :limit [Integer] Page length
+    # @param :multisite [Boolean] Site is multisite
+    def initialize(site:, url:, database: nil, prefix: 'wp_', limit: 10, multisite: nil)
+      @site = site
+      @prefix = prefix.freeze
+      @limit = limit.freeze
+      @url = url.freeze
+      @database = database || File.join(site.source, '_data', 'wordpress.sqlite3')
+      @multisite = multisite
+    end
+    # Generate database connections for a multisite WP
+    #
+    # @return [Hash] { "ID" => SuttyMigration::Wordpress }
+    def blogs
+      @blogs ||= wp["select * from #{prefix}blogs"].to_a.map do |blog|
+        url   = "https://#{blog[:domain]}#{blog[:path]}"
+        pfx   = "#{prefix}#{blog[:blog_id]}_" if blog[:blog_id] > 1
+        pfx ||= prefix
+        [blog[:blog_id],
+         blog.merge(db: self.class.new(site: site, url: url, prefix: pfx, database: database, limit: limit,
+                                       multisite: self))]
+      end.to_h
+    end
+    def options
+      @options ||= wp["select option_name, option_value from #{prefix}options"].to_a.map(&:values).to_h.transform_keys(&:to_sym)
+    end
+    # Open the database.
+    #
+    # @return [Sequel::SQLite::Database]
+    def wp
+      @wp ||= Sequel.sqlite(database).tap do |db|
+        db.extension :pagination
+      end
+    end
+    # Downloads the file behind every post whose layout is
+    # `attachment`, using the post's `guid` as the file URL, and stores
+    # the value returned by #download under the 'file_path' key of the
+    # attachment's front matter.
+    #
+    # NOTE(review): assumes every attachment has a front_matter Hash;
+    # a nil front matter would raise here -- confirm against #posts.
+    #
+    # @param :progress [Boolean] Toggle progress bar
+    # @return [Array] The attachment posts that were iterated
+    def download_all(progress: true)
+      posts(layout: 'attachment').each do |attachment|
+        attachment[:front_matter]['file_path'] = download(url: attachment[:guid], progress: progress)
+      end
+    end
+    # Downloads a file if needed, optionally showing a progress bar.
+    #
+    # @param :url [String] File URL
+    # @param :progress [Boolean] Toggle progress bar
+    # @return [String] File local path
+    def download(url:, progress: true)
+      uri = URI(url)
+      dest = uri.path.sub(%r{\A/}, '')
+      full = File.join(site.source, dest)
+      return dest if File.exist? full
+      ::Jekyll.logger.info "Downloading #{dest}"
+      FileUtils.mkdir_p File.dirname(full)
+      File.open(full, 'w') do |f|
+        if progress
+          head = Faraday.head(url)
+          content_length = head.headers['content-length'].to_i
+          progress = ProgressBar.create(title: File.basename(dest), total: content_length, output: $stderr)
+        end
+        Faraday.get(url) do |req|
+          req.options.on_data = proc do |chunk, downloaded_bytes|
+            f.write chunk
+            if progress
+              progress.progress = downloaded_bytes > content_length ? content_length : downloaded_bytes
+            end
+          end
+        end
+      end
+      dest
+    end
+    # List post types
+    #
+    # @return [Array]
+    def layouts
+      @layouts ||= wp["select distinct post_type from #{prefix}posts"].to_a.map(&:values).flatten
+    end
+    # Finds all posts optionally filtering by post type.  This is not
+    # the official Sequel syntax, but it retrieves metadata as objects
+    # with a single query (and a sub-query).
+    #
+    # @param :layout [String] Layout name, one of #layouts
+    # @param :with_meta [Boolean] Toggle metadata pulling and conversion
+    # @return [Enumerator]
+    def posts(**options)
+      unless options[:layout].blank? || layouts.include?(options[:layout])
+        raise ArgumentError, "#{options[:layout]} must be one of #{layouts.join(', ')}"
+      end
+      wp[post_query(**options)].each_page(limit).to_a.map(&:to_a).flatten.tap do |p|
+        p.map do |post|
+          # Sequel parses dates on localtime
+          post[:date] = ::Jekyll::Utils.parse_date(post[:date]) unless post[:date].blank?
+          unless post[:last_modified_at].blank?
+            post[:last_modified_at] =
+              ::Jekyll::Utils.parse_date(post[:last_modified_at])
+          end
+          post[:front_matter] =
+            begin
+              unless post[:front_matter].blank?
+                JSON.parse(post[:front_matter]).transform_keys(&:to_sym).transform_values do |v|
+                  v.size == 1 ? v.first : v
+                end
+              end
+            rescue JSON::ParserError
+              {}
+            end
+          post[:terms] =
+            begin
+              unless post[:terms].blank?
+                JSON.parse(post[:terms]).transform_keys(&:to_sym).transform_values do |v|
+                  v.size == 1 ? v.first : v
+                end
+              end
+            rescue JSON::ParserError
+              {}
+            end
+        end
+      end
+    end
+    # Brings all users.
+    #
+    # @param :with_meta [Boolean] include metadata
+    # @return [Array]
+    def users(**options)
+      options[:with_meta] = true unless options.key? :with_meta
+      wp[user_query(**options)].each_page(limit).to_a.map(&:to_a).flatten.tap do |u|
+        next unless options[:with_meta]
+        u.map do |user|
+          user[:meta] = JSON.parse(user[:meta]).transform_keys(&:to_sym) unless user[:meta].blank?
+        end
+      end
+    end
+    private
+    # Finds all users.  If it's a multisite WP, we need to check the
+    # main table.
+    #
+    # @param :with_meta [Boolean] include metadata
+    # @return [String]
+    def user_query(with_meta: true)
+      pfx = multisite&.prefix || prefix
+      <<~EOQ
+        select
+          u.*
+          #{', json_group_object(m.meta_key, m.meta_value) as meta' if with_meta}
+        from #{pfx}users as u
+        #{"left join #{pfx}usermeta as m on m.user_id = u.id" if with_meta}
+        group by u.id
+      EOQ
+    end
+    # Query for posts, optionally bringing metadata as JSON objects.
+    #
+    # @param :layout [String] Layout name
+    # @param :with_meta [Boolean] Query metadata
+    # @return [String]
+    def post_query(layout: nil, with_meta: true)
+      <<~EOQ
+        select
+          p.ID as id,
+          strftime('%Y-%m-%d %H:%M:%S UTC', p.post_date_gmt) as date,
+          strftime('%Y-%m-%d %H:%M:%S UTC', p.post_modified_gmt) as last_modified_at,
+          p.post_author as author,
+          p.post_type as layout,
+          p.post_name as slug,
+          p.post_title as title,
+          p.post_content as content,
+          p.post_excerpt as excerpt,
+          p.post_status as status,
+          p.comment_status as comment_status,
+          p.ping_status as ping_status,
+          p.post_password as password,
+          p.to_ping as to_ping,
+          p.pinged as pinged,
+          p.post_content_filtered as content_filtered,
+          p.post_parent as parent,
+          p.guid as guid,
+          p.menu_order as menu_order,
+          p.post_mime_type as mime_type,
+          p.comment_count as comment_count
+          #{', f.front_matter as front_matter' if with_meta}
+          #{', t.terms as terms' if with_meta}
+        from #{prefix}posts as p
+        #{"left join (#{meta_query}) as f on f.post_id = p.ID" if with_meta}
+        #{"left join (#{terms_query}) as t on t.post_id = p.ID" if with_meta}
+        #{"where p.post_type = '#{layout}'" if layout}
+        group by p.ID
+      EOQ
+    end
+    # Recover the post meta as a JSON object with multiple values
+    # converted to arrays
+    #
+    # @return [String]
+    def meta_query
+      <<~EOQ
+        select
+          post_id,
+          json_group_object(meta_key, json(meta_values)) as front_matter
+        from (
+          select
+            post_id,
+            meta_key,
+            json_group_array(meta_value) as meta_values
+          from #{prefix}postmeta
+          group by post_id, meta_key
+        )
+        group by post_id
+      EOQ
+    end
+    # Term taxonomy query
+    #
+    # @param :layout [String] Layout name
+    # @return [String]
+    def terms_query
+      <<~EOQ
+        select
+          post_id,
+          json_group_object(taxonomy, json(terms)) as terms
+        from (
+          select
+            r.object_id as post_id,
+            tt.taxonomy,
+            json_group_array(t.name) as terms
+          from #{prefix}term_relationships as r
+          left join #{prefix}term_taxonomy as tt on tt.term_taxonomy_id = r.term_taxonomy_id
+          left join #{prefix}terms as t on t.term_id = tt.term_id
+          group by r.object_id)
+        group by post_id
+      EOQ
+    end
+  end
+end

data/lib/sutty_migration/wordpress_xml.rb ADDED Viewed

@@ -0,0 +1,154 @@
+# frozen_string_literal: true
+require 'nokogiri'
+require_relative 'wordpress_xml/post'
+require_relative 'wordpress_xml/attachment'
+module SuttyMigration
+  # Understands the XML dump generated by Wordpress and creates
+  # Jekyll::Documents
+  class WordpressXml
+    attr_reader :site, :file, :xml
+    # @param :site [Jekyll::Site] Jekyll site
+    # @param :file [String] File path
+    def initialize(site:, file:)
+      @site = site
+      @file = file
+      @xml  = Nokogiri::XML File.read(file)
+      # Make things easier by removing namespaces.
+      xml.remove_namespaces!
+    end
+    def inspect
+      '#<SuttyMigration::WordpressXml>'
+    end
+    # Site URL
+    #
+    # @return [String]
+    def url
+      @url ||= attribute_value(xml, 'channel > link')
+    end
+    # Site title
+    #
+    # @return [String]
+    def title
+      @title ||= attribute_value(xml, 'channel > title')
+    end
+    # Description
+    #
+    # @return [String]
+    def description
+      @description ||= attribute_value(xml, 'channel > description')
+    end
+    # Language
+    #
+    # TODO: Migrate multilanguage sites.
+    #
+    # @return [String]
+    def language
+      @language ||= attribute_value(xml, 'channel > language')
+    end
+    # Authors with attributes, indexed by author email.
+    #
+    # @return [Hash]
+    def authors
+      @authors ||= xml.css('channel > author').map do |author|
+        {
+          attribute_value(author, 'author_email') => {
+            id: attribute_value(author, 'author_id').to_i,
+            display_name: attribute_value(author, 'author_display_name'),
+            first_name: attribute_value(author, 'author_first_name'),
+            last_name: attribute_value(author, 'author_last_name'),
+            email: attribute_value(author, 'author_email')
+          }
+        }
+      end.reduce(&:merge)
+    end
+    # Categories with attributes, indexed by slug ("nicename")
+    #
+    # @return [Hash]
+    def categories
+      @categories ||= xml.css('channel > category').map do |category|
+        {
+          attribute_value(category, 'category_nicename') => {
+            id: attribute_value(category, 'term_id').to_i,
+            title: attribute_value(category, 'cat_name'),
+            parent: attribute_value(category, 'category_parent'),
+            slug: attribute_value(category, 'category_nicename')
+          }
+        }
+      end.reduce(&:merge)
+    end
+    # Tags with attributes, indexed by slug
+    #
+    # @return [Hash]
+    def tags
+      @tags ||= xml.css('channel > tag').map do |tag|
+        {
+          attribute_value(tag, 'tag_slug') => {
+            id: attribute_value(tag, 'term_id').to_i,
+            title: attribute_value(tag, 'tag_name'),
+            slug: attribute_value(tag, 'tag_slug')
+          }
+        }
+      end.reduce(&:merge)
+    end
+    # Posts, indexed by ID
+    #
+    # @return [Hash]
+    def posts
+      @posts ||= items_find_by('post_type', 'post').map do |post|
+        { attribute_value(post, 'post_id').to_i => Post.new(wordpress: self, item: post) }
+      end.reduce(&:merge)
+    end
+    # Pages, indexed by ID
+    #
+    # @return [Hash]
+    def pages
+      @pages ||= items_find_by('post_type', 'page').map do |page|
+        { attribute_value(page, 'post_id').to_i => Post.new(wordpress: self, item: page) }
+      end.reduce(&:merge)
+    end
+    # Attachments, indexed by ID
+    #
+    # @return [Hash]
+    def attachments
+      @attachments ||= items_find_by('post_type', 'attachment').map do |attachment|
+        { attribute_value(attachment, 'post_id').to_i => Attachment.new(wordpress: self, item: attachment) }
+      end.reduce(&:merge)
+    end
+    # Find items by attribute and value
+    #
+    # @param [String] Attribute name
+    # @param [String] Attribute value
+    # @return [Nokogiri::NodeSet]
+    def items_find_by(attribute, value)
+      xml.css('channel > item').select do |item|
+        attribute_value(item, attribute) == value
+      end
+    end
+    # Get element's attribute value
+    #
+    # @param [Nokogiri::XML::Element]
+    # @param [String]
+    # @return [String]
+    def attribute_value(element, attribute)
+      element.at_css(attribute).text
+    end
+  end
+end

data/lib/sutty_migration/wordpress_xml/attachment.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+require_relative 'post'
+require 'php-serialize'
+require 'faraday'
+require 'progressbar'
+module SuttyMigration
+  class WordpressXml
+    # Represents an attachment or uploaded file.
+    class Attachment < Post
+      # File URL
+      #
+      # @return [String]
+      def attachment_url
+        @attachment_url ||= attribute_value 'attachment_url'
+      end
+      # File destination
+      #
+      # @return [String]
+      def dest
+        @dest ||= URI(attachment_url).path.sub(%r{\A/}, '')
+      end
+      # Metadata, with file information as a Hash
+      #
+      # @return [Hash]
+      def meta
+        super.tap do |m|
+          m['_wp_attachment_metadata'] = PHP.unserialize m['_wp_attachment_metadata']
+        end
+      end
+      # Download the file if it doesn't exist.  Optionally show a
+      # progress bar.
+      #
+      # @param :progress [Boolean]
+      # @return [Boolean]
+      def download(progress: true)
+        return true if File.exist? dest
+        ::Jekyll.logger.info "Downloading #{dest}"
+        FileUtils.mkdir_p File.dirname(dest)
+        File.open(dest, 'w') do |f|
+          if progress
+            head = Faraday.head(attachment_url)
+            content_length = head.headers['content-length'].to_i
+            progress = ProgressBar.create(title: File.basename(dest), total: content_length, output: $stderr)
+          end
+          Faraday.get(attachment_url) do |req|
+            req.options.on_data = proc do |chunk, downloaded_bytes|
+              f.write chunk
+              if progress
+                progress.progress = downloaded_bytes > content_length ? content_length : downloaded_bytes
+              end
+            end
+          end
+        end
+        File.exist? dest
+      end
+    end
+  end
+end

data/lib/sutty_migration/wordpress_xml/post.rb ADDED Viewed

@@ -0,0 +1,171 @@
+# frozen_string_literal: true
+require 'wordpress_formatting/wpautop'
+require 'jekyll/utils'
+module SuttyMigration
+  class WordpressXml
+    # Represents a WordPress post
+    class Post
+      attr_reader :wordpress, :item
+      # @param :wordpress [SuttyMigration::WordpressXml]
+      # @param :item [Nokogiri::XML::Element]
+      def initialize(wordpress:, item:)
+        @wordpress = wordpress
+        @item = item
+      end
+      def inspect
+        "#<SuttyMigration::WordpressXml::Post title=\"#{title}\">"
+      end
+      # Post ID
+      #
+      # @return [Integer]
+      def id
+        @id ||= attribute_value('post_id').to_i
+      end
+      # Permalink. Absolute URL to the post.
+      #
+      # @return [String]
+      def permalink
+        @permalink ||= attribute_value('link').sub(wordpress.url, '')
+      end
+      # Title
+      #
+      # @return [String]
+      def title
+        @title ||= attribute_value('title')
+      end
+      # Description
+      #
+      # @return [String]
+      def description
+        @description ||= attribute_value('description')
+      end
+      # Slug ("post name")
+      #
+      # @return [String]
+      def slug
+        @slug ||= attribute_value('post_name')
+      end
+      # Publication date.
+      #
+      # WordPress can store this date in three different fields and
+      # sometimes they come empty or invalid.
+      #
+      # @return [Time]
+      def date
+        @date ||= %w[pubDate post_date_gmt post_date].map do |date_attr|
+          ::Jekyll::Utils.parse_date attribute_value(date_attr)
+        rescue StandardError
+        end.compact.first
+      end
+      # Modification date.
+      #
+      # @return [Time]
+      def last_modified_at
+        @last_modified_at ||= ::Jekyll::Utils.parse_date attribute_value('post_modified_gmt')
+      end
+      # Content as HTML, with site URL removed.
+      #
+      # @return [String]
+      def content
+        @content ||= WordpressFormatting::Wpautop.wpautop(attribute_value('encoded')).gsub(
+          / (href|src)="#{wordpress.url}/, ' \\1="'
+        )
+      end
+      # Author attributes.
+      #
+      # @return [Hash]
+      def author
+        @author ||= wordpress.authors[attribute_value('creator')]
+      end
+      # Post password.  Use with jekyll-crypto.
+      #
+      # @return [String]
+      def password
+        @password ||= attribute_value 'post_password'
+      end
+      # Tags with attributes.
+      #
+      # @return [Hash]
+      def tags
+        @tags ||= item.css('category').select do |c|
+          c[:domain] == 'post_tag'
+        end.map do |c|
+          wordpress.tags[c[:nicename]]
+        end
+      end
+      # Categories with attributes.
+      #
+      # @return [Hash]
+      def categories
+        @categories ||= item.css('category').select do |c|
+          c[:domain] == 'category'
+        end.map do |c|
+          wordpress.categories[c[:nicename]]
+        end
+      end
+      # Metadata.  Plugins store useful information here.  Duplicated
+      # keys are returned as an Array of values.
+      #
+      # @return [Hash]
+      def meta
+        @meta ||= {}.tap do |meta|
+          item.css('postmeta').each do |m|
+            key = m.css('meta_key').text
+            value = m.css('meta_value').text
+            case meta[key]
+            when nil then meta[key] = value
+            when String then meta[key] = [meta[key], value]
+            when Array then meta[key] << value
+            end
+          end
+        end
+      end
+      # Order.  Higher are sorted on top by jekyll-order.
+      #
+      # @return [Integer]
+      def order
+        @order ||= attribute_value 'is_sticky'
+      end
+      # Publication status
+      #
+      # @return [Boolean]
+      def published?
+        @published ||= attribute_value('status') == 'publish'
+      end
+      # Publication status
+      #
+      # @return [Boolean]
+      def draft?
+        @draft ||= attribute_value('status') == 'draft'
+      end
+      # Get a value from the attribute
+      #
+      # @return [String]
+      def attribute_value(key)
+        item.at_css(key).text
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: sutty-migration
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.3.0
 platform: ruby
 authors:
 - f
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-05-28 00:00:00.000000000 Z
+date: 2021-08-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: jekyll
@@ -52,6 +52,118 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.0'
+- !ruby/object:Gem::Dependency
+  name: faraday
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.4'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.4'
+- !ruby/object:Gem::Dependency
+  name: progressbar
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.11'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.11'
+- !ruby/object:Gem::Dependency
+  name: sqlite3
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.4'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.4'
+- !ruby/object:Gem::Dependency
+  name: sequel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.45'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.45'
+- !ruby/object:Gem::Dependency
+  name: wordpress-formatting
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.12.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.12.0
+- !ruby/object:Gem::Dependency
+  name: php-serialize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Takes datafiles and converts them into posts
 email:
 - f@sutty.nl
@@ -64,7 +176,13 @@ files:
 - LICENSE.txt
 - README.md
 - lib/sutty-migration.rb
-- lib/wordpress.rb
+- lib/sutty_migration/core_extensions.rb
+- lib/sutty_migration/data.rb
+- lib/sutty_migration/jekyll/document_creator.rb
+- lib/sutty_migration/wordpress.rb
+- lib/sutty_migration/wordpress_xml.rb
+- lib/sutty_migration/wordpress_xml/attachment.rb
+- lib/sutty_migration/wordpress_xml/post.rb
 homepage: https://0xacab.org/sutty/jekyll/sutty-migration
 licenses:
 - GPL-3.0

data/lib/wordpress.rb DELETED Viewed

@@ -1,174 +0,0 @@
-# frozen_string_literal: true
-# Debug
-require 'pry'
-# Generar UUIDs
-require 'securerandom'
-# Traer resultados de la base de datos
-require 'sequel'
-require 'sqlite3'
-require 'json'
-# Limpieza de contenido
-require 'loofah'
-require 'rails/html/scrubbers'
-require 'rails/html/sanitizer'
-require 'reverse_markdown'
-# Descargar archivos
-require 'faraday'
-# Reads WordPress posts, their metadata and their taxonomy terms from a
-# SQLite database stored in the site's _data directory, and downloads
-# uploaded files from the original site.
-class Wordpress
-  attr_reader :site, :prefix, :limit, :url
-  # @param site [Object] must respond to #source (presumably a Jekyll::Site -- confirm)
-  # @param url [String] base URL used to build download URLs
-  # @param prefix [String] table name prefix
-  # @param limit [Integer] page size passed to each_page in #posts
-  def initialize(site:, url:, prefix: 'wp_', limit: 10)
-    @site = site
-    @prefix = prefix.freeze
-    @limit = limit.freeze
-    @url = url.freeze
-    # Connect to the database
-    @wp = Sequel.sqlite(File.join(site.source, '_data', 'wordpress', 'post.sqlite3'))
-    # JSON functions are CPU-heavy, so fetch only a few records at a
-    # time.
-    @wp.extension :pagination
-  end
-  # Downloads a file from under wp-content/uploads/ into the site
-  # source, unless it already exists there.
-  #
-  # @param file [String] path relative to wp-content/uploads/
-  # @return [String] destination path relative to the site source
-  def download(file)
-    dest = 'wp-content/uploads/' + file
-    full = File.join(site.source, dest)
-    return dest if File.exist? full
-    Jekyll.logger.info "Downloading #{dest}"
-    FileUtils.mkdir_p File.dirname(full)
-    File.open(full, 'w') do |f|
-      Faraday.get(url + '/' + dest) do |req|
-        # Write each received chunk straight to the file.
-        req.options.on_data = Proc.new do |chunk, _|
-          f.write chunk
-        end
-      end
-    end
-    dest
-  end
-  # Gets all posts, optionally filtering by post type.
-  # This is not the official Sequel way, but we don't have time to
-  # learn it specifically, and besides we have the options in JSON
-  # format, which wouldn't be supported.
-  #
-  # NOTE(review): `layout` is interpolated into the SQL text without
-  # escaping.
-  def posts(layout: nil)
-    query  = post_query.dup
-    query += " where post_type = '#{layout}'" if layout
-    query += ' group by posts.ID'
-    @wp[query].each_page(limit)
-  end
-  # Taxonomy terms related to a post, as an Array of rows.
-  #
-  # @param id [Integer, String] post ID
-  # @return [Array]
-  def meta(id:)
-    @wp[meta_query(id: id)].to_a
-  end
-  private
-  # Get all posts; json_objectagg requires MariaDB 10.5
-  def post_query
-    @post_query ||= <<~EOQ
-      select ID as id,
-        post_title as title,
-        post_name as slug,
-        post_type as layout,
-        strftime('%Y-%m-%d', post_date) as date,
-        post_status as status,
-        post_content as content,
-        json_group_object(meta_key, meta_value) as data
-      from #{prefix}posts as posts
-      left join #{prefix}postmeta as frontmatter
-      on posts.ID = frontmatter.post_id
-    EOQ
-  end
-  # Builds the SQL that selects name, taxonomy type, parent and term ID
-  # of every term related to the given post.
-  #
-  # NOTE(review): `id` is interpolated into the SQL text without
-  # escaping.
-  def meta_query(id:)
-    <<~EOQ
-      SELECT
-        terms.name AS `name`,
-        ttax.taxonomy AS `type`,
-        ttax.parent AS `parent`,
-        ttax.term_id AS `id`
-      FROM
-        #{prefix}terms AS `terms`,
-        #{prefix}term_relationships AS `trels`,
-        #{prefix}term_taxonomy AS `ttax`
-      WHERE
-        trels.object_id = '#{id}' AND
-        trels.term_taxonomy_id = ttax.term_taxonomy_id AND
-        terms.term_id = ttax.term_id
-    EOQ
-  end
-end
-# Before generating the site, read every article from the database and
-# generate it locally.
-Jekyll::Hooks.register :site, :post_read do |site|
-  wp = Wordpress.new(site: site,
-                     url: site.config.dig('wordpress', 'url'),
-                     prefix: site.config.dig('wordpress', 'prefix'))
-  collection = site.collections['posts']
-  # NOTE(review): not referenced again in this block.  Also, inside a
-  # double-quoted string "\P" presumably collapses to a plain "P" --
-  # confirm the intended pattern.
-  ascii_re = Regexp.new("\P{ASCII}").freeze
-  sanitizer = Rails::Html::SafeListSanitizer.new
-  # Fetch every uploaded image and download it
-  attachments = wp.posts(layout: 'attachment').map do |page|
-    page.map do |attachment|
-      attachment[:data] = JSON.parse(attachment[:data]) unless attachment[:data].nil?
-      file = attachment.dig(:data, '_wp_attached_file')
-      next unless file
-      dest = wp.download(file)
-      # Keep a map of IDs to destination files
-      [ attachment[:id], dest ]
-    end
-  # NOTE(review): compact runs on the outer list of pages, so nil
-  # entries produced by `next` inside a page appear to survive until
-  # to_h -- verify.
-  end.compact.flatten(1).to_h
-  %w[post page].each do |type|
-    wp.posts(layout: type).each do |page|
-      page.each do |post|
-        # Convert the extra data into a Hash
-        post[:data] = JSON.parse(post[:data]) unless post[:data].nil?
-        post[:slug] = Jekyll::Utils.slugify(post[:title], mode: 'latin') if post[:slug].empty?
-        post[:meta] = wp.meta id: post[:id]
-        # Destination file: _posts/<date>-<slug>.markdown
-        path = File.join(site.source, '_posts', post.slice(:date, :slug).values.join('-') + '.markdown')
-        if File.exist? path
-          Jekyll.logger.info "#{path} ya fue migrado, actualizando"
-          # NOTE(review): find returns nil when no document carries this
-          # ID, which would make the doc.data calls below fail -- verify.
-          doc = site.documents.find do |d|
-            d['id'] == post[:id]
-          end
-        else
-          # Create a new post and add it to the collection
-          collection.docs << doc = Jekyll::Document.new(path, site: site, collection: collection)
-          doc.data['uuid'] = SecureRandom.uuid
-        end
-        thumbnail = post.dig(:data, '_thumbnail_id')&.to_i
-        doc.data['layout'] = type
-        doc.data['title'] = post[:title]
-        # Every status other than "publish" is treated as a draft.
-        doc.data['draft'] = post[:status] != 'publish'
-        doc.data['id'] = post[:id]
-        doc.data['date'] = Jekyll::Utils.parse_date(post[:date])
-        doc.data['tags'] = post[:meta].select { |k| k[:type] == 'post_tag' }.map { |k| k[:name] }
-        doc.data['categories'] = post[:meta].select { |k| k[:type] == 'category' }.map { |k| k[:name] }
-        doc.data['image'] = attachments[thumbnail] if thumbnail
-        # Sanitize the HTML and convert it to Markdown.
-        doc.content = ReverseMarkdown.convert(sanitizer.sanitize(post[:content]))
-        doc.save
-      end
-    end
-  end
-end