bunto-import 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +21 -21
- data/README.markdown +33 -33
- data/lib/bunto-import.rb +49 -49
- data/lib/bunto-import/importer.rb +26 -26
- data/lib/bunto-import/importers.rb +10 -10
- data/lib/bunto-import/importers/behance.rb +80 -80
- data/lib/bunto-import/importers/blogger.rb +330 -264
- data/lib/bunto-import/importers/csv.rb +96 -96
- data/lib/bunto-import/importers/drupal6.rb +53 -139
- data/lib/bunto-import/importers/drupal7.rb +54 -111
- data/lib/bunto-import/importers/drupal_common.rb +157 -0
- data/lib/bunto-import/importers/easyblog.rb +96 -96
- data/lib/bunto-import/importers/enki.rb +74 -74
- data/lib/bunto-import/importers/ghost.rb +68 -68
- data/lib/bunto-import/importers/google_reader.rb +64 -64
- data/lib/bunto-import/importers/joomla.rb +92 -90
- data/lib/bunto-import/importers/joomla3.rb +91 -91
- data/lib/bunto-import/importers/jrnl.rb +125 -125
- data/lib/bunto-import/importers/marley.rb +72 -72
- data/lib/bunto-import/importers/mephisto.rb +99 -99
- data/lib/bunto-import/importers/mt.rb +257 -257
- data/lib/bunto-import/importers/posterous.rb +130 -130
- data/lib/bunto-import/importers/rss.rb +62 -62
- data/lib/bunto-import/importers/s9y.rb +60 -60
- data/lib/bunto-import/importers/s9y_database.rb +363 -0
- data/lib/bunto-import/importers/textpattern.rb +70 -70
- data/lib/bunto-import/importers/tumblr.rb +300 -289
- data/lib/bunto-import/importers/typo.rb +88 -88
- data/lib/bunto-import/importers/wordpress.rb +372 -372
- data/lib/bunto-import/importers/wordpressdotcom.rb +207 -207
- data/lib/bunto-import/util.rb +76 -76
- data/lib/bunto-import/version.rb +3 -3
- data/lib/bunto/commands/import.rb +79 -79
- metadata +84 -54
@@ -1,207 +1,207 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
module BuntoImport
|
4
|
-
module Importers
|
5
|
-
class WordpressDotCom < Importer
|
6
|
-
def self.require_deps
|
7
|
-
BuntoImport.require_with_fallback(%w[
|
8
|
-
rubygems
|
9
|
-
fileutils
|
10
|
-
safe_yaml
|
11
|
-
hpricot
|
12
|
-
time
|
13
|
-
open-uri
|
14
|
-
open_uri_redirections
|
15
|
-
])
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.specify_options(c)
|
19
|
-
c.option 'source', '--source FILE', 'WordPress export XML file (default: "wordpress.xml")'
|
20
|
-
c.option 'no_fetch_images', '--no-fetch-images', 'Do not fetch the images referenced in the posts'
|
21
|
-
c.option 'assets_folder', '--assets_folder FOLDER', 'Folder where assets such as images will be downloaded to (default: assets)'
|
22
|
-
end
|
23
|
-
|
24
|
-
# Will modify post DOM tree
|
25
|
-
def self.download_images(title, post_hpricot, assets_folder)
|
26
|
-
images = (post_hpricot/"img")
|
27
|
-
if images.length == 0
|
28
|
-
return
|
29
|
-
end
|
30
|
-
puts "Downloading images for " + title
|
31
|
-
images.each do |i|
|
32
|
-
uri = i["src"]
|
33
|
-
|
34
|
-
i["src"] = "{{ site.baseurl }}/%s/%s" % [assets_folder, File.basename(uri)]
|
35
|
-
dst = File.join(assets_folder, File.basename(uri))
|
36
|
-
puts " " + uri
|
37
|
-
if File.exist?(dst)
|
38
|
-
puts " Already in cache. Clean assets folder if you want a redownload."
|
39
|
-
next
|
40
|
-
end
|
41
|
-
begin
|
42
|
-
open(uri, allow_redirections: :safe) {|f|
|
43
|
-
File.open(dst, "wb") do |out|
|
44
|
-
out.puts f.read
|
45
|
-
end
|
46
|
-
}
|
47
|
-
puts " OK!"
|
48
|
-
rescue => e
|
49
|
-
puts " Error: #{e.message}"
|
50
|
-
puts e.backtrace.join("\n")
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
class Item
|
56
|
-
def initialize(node)
|
57
|
-
@node = node
|
58
|
-
end
|
59
|
-
|
60
|
-
def text_for(path)
|
61
|
-
@node.at(path).inner_text
|
62
|
-
end
|
63
|
-
|
64
|
-
def title
|
65
|
-
@title ||= text_for(:title).strip
|
66
|
-
end
|
67
|
-
|
68
|
-
def permalink_title
|
69
|
-
post_name = text_for('wp:post_name')
|
70
|
-
# Fallback to "prettified" title if post_name is empty (can happen)
|
71
|
-
@permalink_title ||= if post_name.empty?
|
72
|
-
WordpressDotCom.sluggify(title)
|
73
|
-
else
|
74
|
-
post_name
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
def published_at
|
79
|
-
if published?
|
80
|
-
@published_at ||= Time.parse(text_for('wp:post_date'))
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def status
|
85
|
-
@status ||= text_for('wp:status')
|
86
|
-
end
|
87
|
-
|
88
|
-
def post_type
|
89
|
-
@post_type ||= text_for('wp:post_type')
|
90
|
-
end
|
91
|
-
|
92
|
-
def file_name
|
93
|
-
@file_name ||= if published?
|
94
|
-
"#{published_at.strftime('%Y-%m-%d')}-#{permalink_title}.html"
|
95
|
-
else
|
96
|
-
"#{permalink_title}.html"
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
def directory_name
|
101
|
-
@directory_name ||= if !published? && post_type == 'post'
|
102
|
-
'_drafts'
|
103
|
-
else
|
104
|
-
"_#{post_type}s"
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def published?
|
109
|
-
@published ||= (status == 'publish')
|
110
|
-
end
|
111
|
-
|
112
|
-
def excerpt
|
113
|
-
@excerpt ||= begin
|
114
|
-
text = Hpricot(text_for('excerpt:encoded')).inner_text
|
115
|
-
if text.empty?
|
116
|
-
nil
|
117
|
-
else
|
118
|
-
text
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
def self.process(options)
|
125
|
-
source = options.fetch('source', "wordpress.xml")
|
126
|
-
fetch = !options.fetch('no_fetch_images', false)
|
127
|
-
assets_folder = options.fetch('assets_folder', 'assets')
|
128
|
-
FileUtils.mkdir_p(assets_folder)
|
129
|
-
|
130
|
-
import_count = Hash.new(0)
|
131
|
-
doc = Hpricot::XML(File.read(source))
|
132
|
-
# Fetch authors data from header
|
133
|
-
authors = Hash[
|
134
|
-
(doc/:channel/'wp:author').map do |author|
|
135
|
-
[author.at("wp:author_login").inner_text.strip, {
|
136
|
-
"login" => author.at("wp:author_login").inner_text.strip,
|
137
|
-
"email" => author.at("wp:author_email").inner_text,
|
138
|
-
"display_name" => author.at("wp:author_display_name").inner_text,
|
139
|
-
"first_name" => author.at("wp:author_first_name").inner_text,
|
140
|
-
"last_name" => author.at("wp:author_last_name").inner_text
|
141
|
-
}]
|
142
|
-
end
|
143
|
-
] rescue {}
|
144
|
-
|
145
|
-
(doc/:channel/:item).each do |node|
|
146
|
-
item = Item.new(node)
|
147
|
-
categories = node.search('category[@domain="category"]').map(&:inner_text).reject{|c| c == 'Uncategorized'}.uniq
|
148
|
-
tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
|
149
|
-
|
150
|
-
metas = Hash.new
|
151
|
-
node.search("wp:postmeta").each do |meta|
|
152
|
-
key = meta.at('wp:meta_key').inner_text
|
153
|
-
value = meta.at('wp:meta_value').inner_text
|
154
|
-
metas[key] = value
|
155
|
-
end
|
156
|
-
|
157
|
-
author_login = item.text_for('dc:creator').strip
|
158
|
-
|
159
|
-
header = {
|
160
|
-
'layout' => item.post_type,
|
161
|
-
'title' => item.title,
|
162
|
-
'date' => item.published_at,
|
163
|
-
'type' => item.post_type,
|
164
|
-
'published' => item.published?,
|
165
|
-
'status' => item.status,
|
166
|
-
'categories' => categories,
|
167
|
-
'tags' => tags,
|
168
|
-
'meta' => metas,
|
169
|
-
'author' => authors[author_login]
|
170
|
-
}
|
171
|
-
|
172
|
-
begin
|
173
|
-
content = Hpricot(item.text_for('content:encoded'))
|
174
|
-
header['excerpt'] = item.excerpt if item.excerpt
|
175
|
-
|
176
|
-
if fetch
|
177
|
-
download_images(item.title, content, assets_folder)
|
178
|
-
end
|
179
|
-
|
180
|
-
FileUtils.mkdir_p item.directory_name
|
181
|
-
File.open(File.join(item.directory_name, item.file_name), "w") do |f|
|
182
|
-
f.puts header.to_yaml
|
183
|
-
f.puts '---'
|
184
|
-
f.puts Util.wpautop(content.to_html)
|
185
|
-
end
|
186
|
-
rescue => e
|
187
|
-
puts "Couldn't import post!"
|
188
|
-
puts "Title: #{item.title}"
|
189
|
-
puts "Name/Slug: #{item.file_name}\n"
|
190
|
-
puts "Error: #{e.message}"
|
191
|
-
next
|
192
|
-
end
|
193
|
-
|
194
|
-
import_count[item.post_type] += 1
|
195
|
-
end
|
196
|
-
|
197
|
-
import_count.each do |key, value|
|
198
|
-
puts "Imported #{value} #{key}s"
|
199
|
-
end
|
200
|
-
end
|
201
|
-
|
202
|
-
def self.sluggify(title)
|
203
|
-
title.gsub(/[^[:alnum:]]+/, '-').downcase
|
204
|
-
end
|
205
|
-
end
|
206
|
-
end
|
207
|
-
end
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module BuntoImport
|
4
|
+
module Importers
|
5
|
+
class WordpressDotCom < Importer
|
6
|
+
def self.require_deps
|
7
|
+
BuntoImport.require_with_fallback(%w[
|
8
|
+
rubygems
|
9
|
+
fileutils
|
10
|
+
safe_yaml
|
11
|
+
hpricot
|
12
|
+
time
|
13
|
+
open-uri
|
14
|
+
open_uri_redirections
|
15
|
+
])
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.specify_options(c)
|
19
|
+
c.option 'source', '--source FILE', 'WordPress export XML file (default: "wordpress.xml")'
|
20
|
+
c.option 'no_fetch_images', '--no-fetch-images', 'Do not fetch the images referenced in the posts'
|
21
|
+
c.option 'assets_folder', '--assets_folder FOLDER', 'Folder where assets such as images will be downloaded to (default: assets)'
|
22
|
+
end
|
23
|
+
|
24
|
+
# Will modify post DOM tree
|
25
|
+
def self.download_images(title, post_hpricot, assets_folder)
|
26
|
+
images = (post_hpricot/"img")
|
27
|
+
if images.length == 0
|
28
|
+
return
|
29
|
+
end
|
30
|
+
puts "Downloading images for " + title
|
31
|
+
images.each do |i|
|
32
|
+
uri = i["src"]
|
33
|
+
|
34
|
+
i["src"] = "{{ site.baseurl }}/%s/%s" % [assets_folder, File.basename(uri)]
|
35
|
+
dst = File.join(assets_folder, File.basename(uri))
|
36
|
+
puts " " + uri
|
37
|
+
if File.exist?(dst)
|
38
|
+
puts " Already in cache. Clean assets folder if you want a redownload."
|
39
|
+
next
|
40
|
+
end
|
41
|
+
begin
|
42
|
+
open(uri, allow_redirections: :safe) {|f|
|
43
|
+
File.open(dst, "wb") do |out|
|
44
|
+
out.puts f.read
|
45
|
+
end
|
46
|
+
}
|
47
|
+
puts " OK!"
|
48
|
+
rescue => e
|
49
|
+
puts " Error: #{e.message}"
|
50
|
+
puts e.backtrace.join("\n")
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
class Item
|
56
|
+
def initialize(node)
|
57
|
+
@node = node
|
58
|
+
end
|
59
|
+
|
60
|
+
def text_for(path)
|
61
|
+
@node.at(path).inner_text
|
62
|
+
end
|
63
|
+
|
64
|
+
def title
|
65
|
+
@title ||= text_for(:title).strip
|
66
|
+
end
|
67
|
+
|
68
|
+
def permalink_title
|
69
|
+
post_name = text_for('wp:post_name')
|
70
|
+
# Fallback to "prettified" title if post_name is empty (can happen)
|
71
|
+
@permalink_title ||= if post_name.empty?
|
72
|
+
WordpressDotCom.sluggify(title)
|
73
|
+
else
|
74
|
+
post_name
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def published_at
|
79
|
+
if published?
|
80
|
+
@published_at ||= Time.parse(text_for('wp:post_date'))
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
def status
|
85
|
+
@status ||= text_for('wp:status')
|
86
|
+
end
|
87
|
+
|
88
|
+
def post_type
|
89
|
+
@post_type ||= text_for('wp:post_type')
|
90
|
+
end
|
91
|
+
|
92
|
+
def file_name
|
93
|
+
@file_name ||= if published?
|
94
|
+
"#{published_at.strftime('%Y-%m-%d')}-#{permalink_title}.html"
|
95
|
+
else
|
96
|
+
"#{permalink_title}.html"
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
def directory_name
|
101
|
+
@directory_name ||= if !published? && post_type == 'post'
|
102
|
+
'_drafts'
|
103
|
+
else
|
104
|
+
"_#{post_type}s"
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
def published?
|
109
|
+
@published ||= (status == 'publish')
|
110
|
+
end
|
111
|
+
|
112
|
+
def excerpt
|
113
|
+
@excerpt ||= begin
|
114
|
+
text = Hpricot(text_for('excerpt:encoded')).inner_text
|
115
|
+
if text.empty?
|
116
|
+
nil
|
117
|
+
else
|
118
|
+
text
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
def self.process(options)
|
125
|
+
source = options.fetch('source', "wordpress.xml")
|
126
|
+
fetch = !options.fetch('no_fetch_images', false)
|
127
|
+
assets_folder = options.fetch('assets_folder', 'assets')
|
128
|
+
FileUtils.mkdir_p(assets_folder)
|
129
|
+
|
130
|
+
import_count = Hash.new(0)
|
131
|
+
doc = Hpricot::XML(File.read(source))
|
132
|
+
# Fetch authors data from header
|
133
|
+
authors = Hash[
|
134
|
+
(doc/:channel/'wp:author').map do |author|
|
135
|
+
[author.at("wp:author_login").inner_text.strip, {
|
136
|
+
"login" => author.at("wp:author_login").inner_text.strip,
|
137
|
+
"email" => author.at("wp:author_email").inner_text,
|
138
|
+
"display_name" => author.at("wp:author_display_name").inner_text,
|
139
|
+
"first_name" => author.at("wp:author_first_name").inner_text,
|
140
|
+
"last_name" => author.at("wp:author_last_name").inner_text
|
141
|
+
}]
|
142
|
+
end
|
143
|
+
] rescue {}
|
144
|
+
|
145
|
+
(doc/:channel/:item).each do |node|
|
146
|
+
item = Item.new(node)
|
147
|
+
categories = node.search('category[@domain="category"]').map(&:inner_text).reject{|c| c == 'Uncategorized'}.uniq
|
148
|
+
tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
|
149
|
+
|
150
|
+
metas = Hash.new
|
151
|
+
node.search("wp:postmeta").each do |meta|
|
152
|
+
key = meta.at('wp:meta_key').inner_text
|
153
|
+
value = meta.at('wp:meta_value').inner_text
|
154
|
+
metas[key] = value
|
155
|
+
end
|
156
|
+
|
157
|
+
author_login = item.text_for('dc:creator').strip
|
158
|
+
|
159
|
+
header = {
|
160
|
+
'layout' => item.post_type,
|
161
|
+
'title' => item.title,
|
162
|
+
'date' => item.published_at,
|
163
|
+
'type' => item.post_type,
|
164
|
+
'published' => item.published?,
|
165
|
+
'status' => item.status,
|
166
|
+
'categories' => categories,
|
167
|
+
'tags' => tags,
|
168
|
+
'meta' => metas,
|
169
|
+
'author' => authors[author_login]
|
170
|
+
}
|
171
|
+
|
172
|
+
begin
|
173
|
+
content = Hpricot(item.text_for('content:encoded'))
|
174
|
+
header['excerpt'] = item.excerpt if item.excerpt
|
175
|
+
|
176
|
+
if fetch
|
177
|
+
download_images(item.title, content, assets_folder)
|
178
|
+
end
|
179
|
+
|
180
|
+
FileUtils.mkdir_p item.directory_name
|
181
|
+
File.open(File.join(item.directory_name, item.file_name), "w") do |f|
|
182
|
+
f.puts header.to_yaml
|
183
|
+
f.puts '---'
|
184
|
+
f.puts Util.wpautop(content.to_html)
|
185
|
+
end
|
186
|
+
rescue => e
|
187
|
+
puts "Couldn't import post!"
|
188
|
+
puts "Title: #{item.title}"
|
189
|
+
puts "Name/Slug: #{item.file_name}\n"
|
190
|
+
puts "Error: #{e.message}"
|
191
|
+
next
|
192
|
+
end
|
193
|
+
|
194
|
+
import_count[item.post_type] += 1
|
195
|
+
end
|
196
|
+
|
197
|
+
import_count.each do |key, value|
|
198
|
+
puts "Imported #{value} #{key}s"
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
def self.sluggify(title)
|
203
|
+
title.gsub(/[^[:alnum:]]+/, '-').downcase
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
data/lib/bunto-import/util.rb
CHANGED
@@ -1,76 +1,76 @@
|
|
1
|
-
module BuntoImport
|
2
|
-
module Util
|
3
|
-
|
4
|
-
# Ruby translation of wordpress wpautop (see https://core.trac.wordpress.org/browser/trunk/src/wp-includes/formatting.php)
|
5
|
-
#
|
6
|
-
# A group of regex replaces used to identify text formatted with newlines and
|
7
|
-
# replace double line-breaks with HTML paragraph tags. The remaining
|
8
|
-
# line-breaks after conversion become <<br />> tags, unless $br is set to false
|
9
|
-
#
|
10
|
-
# @param string pee The text which has to be formatted.
|
11
|
-
# @param bool br Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
|
12
|
-
# @return string Text which has been converted into correct paragraph tags.
|
13
|
-
#
|
14
|
-
def self.wpautop(pee, br = true)
|
15
|
-
return '' if pee.strip == ''
|
16
|
-
|
17
|
-
allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|noscript|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
|
18
|
-
pre_tags = {}
|
19
|
-
pee = pee + "\n"
|
20
|
-
|
21
|
-
if pee.include?('<pre')
|
22
|
-
pee_parts = pee.split('</pre>')
|
23
|
-
last_pee = pee_parts.pop
|
24
|
-
pee = ''
|
25
|
-
pee_parts.each_with_index do |pee_part, i|
|
26
|
-
start = pee_part.index('<pre')
|
27
|
-
|
28
|
-
unless start
|
29
|
-
pee += pee_part
|
30
|
-
next
|
31
|
-
end
|
32
|
-
|
33
|
-
name = "<pre wp-pre-tag-#{i}></pre>"
|
34
|
-
pre_tags[name] = pee_part[start..-1] + '</pre>'
|
35
|
-
|
36
|
-
pee += pee_part[0, start] + name
|
37
|
-
end
|
38
|
-
pee += last_pee
|
39
|
-
end
|
40
|
-
|
41
|
-
pee = pee.gsub(Regexp.new('<br />\s*<br />'), "\n\n")
|
42
|
-
pee = pee.gsub(Regexp.new("(<" + allblocks + "[^>]*>)"), "\n\\1")
|
43
|
-
pee = pee.gsub(Regexp.new("(</" + allblocks + ">)"), "\\1\n\n")
|
44
|
-
pee = pee.gsub("\r\n", "\n").gsub("\r", "\n")
|
45
|
-
if pee.include? '<object'
|
46
|
-
pee = pee.gsub(Regexp.new('\s*<param([^>]*)>\s*'), "<param\\1>")
|
47
|
-
pee = pee.gsub(Regexp.new('\s*</embed>\s*'), '</embed>')
|
48
|
-
end
|
49
|
-
|
50
|
-
pees = pee.split(/\n\s*\n/).compact
|
51
|
-
pee = ''
|
52
|
-
pees.each { |tinkle| pee += '<p>' + tinkle.chomp("\n") + "</p>\n" }
|
53
|
-
pee = pee.gsub(Regexp.new('<p>\s*</p>'), '')
|
54
|
-
pee = pee.gsub(Regexp.new('<p>([^<]+)</(div|address|form)>'), "<p>\\1</p></\\2>")
|
55
|
-
pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
|
56
|
-
pee = pee.gsub(Regexp.new('<p>(<li.+?)</p>'), "\\1")
|
57
|
-
pee = pee.gsub(Regexp.new('<p><blockquote([^>]*)>', 'i'), "<blockquote\\1><p>")
|
58
|
-
pee = pee.gsub('</blockquote></p>', '</p></blockquote>')
|
59
|
-
pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)'), "\\1")
|
60
|
-
pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
|
61
|
-
if br
|
62
|
-
pee = pee.gsub(Regexp.new('<(script|style).*?</\1>')) { |match| match.gsub("\n", "<WPPreserveNewline />") }
|
63
|
-
pee = pee.gsub(Regexp.new('(?<!<br />)\s*\n'), "<br />\n")
|
64
|
-
pee = pee.gsub('<WPPreserveNewline />', "\n")
|
65
|
-
end
|
66
|
-
pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*<br />'), "\\1")
|
67
|
-
pee = pee.gsub(Regexp.new('<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)'), "\\1")
|
68
|
-
pee = pee.gsub(Regexp.new('\n</p>$'), '</p>')
|
69
|
-
|
70
|
-
pre_tags.each do |name, value|
|
71
|
-
pee.gsub!(name, value)
|
72
|
-
end
|
73
|
-
pee
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
1
|
+
module BuntoImport
|
2
|
+
module Util
|
3
|
+
|
4
|
+
# Ruby translation of wordpress wpautop (see https://core.trac.wordpress.org/browser/trunk/src/wp-includes/formatting.php)
|
5
|
+
#
|
6
|
+
# A group of regex replaces used to identify text formatted with newlines and
|
7
|
+
# replace double line-breaks with HTML paragraph tags. The remaining
|
8
|
+
# line-breaks after conversion become <<br />> tags, unless $br is set to false
|
9
|
+
#
|
10
|
+
# @param string pee The text which has to be formatted.
|
11
|
+
# @param bool br Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
|
12
|
+
# @return string Text which has been converted into correct paragraph tags.
|
13
|
+
#
|
14
|
+
def self.wpautop(pee, br = true)
|
15
|
+
return '' if pee.strip == ''
|
16
|
+
|
17
|
+
allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|noscript|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
|
18
|
+
pre_tags = {}
|
19
|
+
pee = pee + "\n"
|
20
|
+
|
21
|
+
if pee.include?('<pre')
|
22
|
+
pee_parts = pee.split('</pre>')
|
23
|
+
last_pee = pee_parts.pop
|
24
|
+
pee = ''
|
25
|
+
pee_parts.each_with_index do |pee_part, i|
|
26
|
+
start = pee_part.index('<pre')
|
27
|
+
|
28
|
+
unless start
|
29
|
+
pee += pee_part
|
30
|
+
next
|
31
|
+
end
|
32
|
+
|
33
|
+
name = "<pre wp-pre-tag-#{i}></pre>"
|
34
|
+
pre_tags[name] = pee_part[start..-1] + '</pre>'
|
35
|
+
|
36
|
+
pee += pee_part[0, start] + name
|
37
|
+
end
|
38
|
+
pee += last_pee
|
39
|
+
end
|
40
|
+
|
41
|
+
pee = pee.gsub(Regexp.new('<br />\s*<br />'), "\n\n")
|
42
|
+
pee = pee.gsub(Regexp.new("(<" + allblocks + "[^>]*>)"), "\n\\1")
|
43
|
+
pee = pee.gsub(Regexp.new("(</" + allblocks + ">)"), "\\1\n\n")
|
44
|
+
pee = pee.gsub("\r\n", "\n").gsub("\r", "\n")
|
45
|
+
if pee.include? '<object'
|
46
|
+
pee = pee.gsub(Regexp.new('\s*<param([^>]*)>\s*'), "<param\\1>")
|
47
|
+
pee = pee.gsub(Regexp.new('\s*</embed>\s*'), '</embed>')
|
48
|
+
end
|
49
|
+
|
50
|
+
pees = pee.split(/\n\s*\n/).compact
|
51
|
+
pee = ''
|
52
|
+
pees.each { |tinkle| pee += '<p>' + tinkle.chomp("\n") + "</p>\n" }
|
53
|
+
pee = pee.gsub(Regexp.new('<p>\s*</p>'), '')
|
54
|
+
pee = pee.gsub(Regexp.new('<p>([^<]+)</(div|address|form)>'), "<p>\\1</p></\\2>")
|
55
|
+
pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
|
56
|
+
pee = pee.gsub(Regexp.new('<p>(<li.+?)</p>'), "\\1")
|
57
|
+
pee = pee.gsub(Regexp.new('<p><blockquote([^>]*)>', 'i'), "<blockquote\\1><p>")
|
58
|
+
pee = pee.gsub('</blockquote></p>', '</p></blockquote>')
|
59
|
+
pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)'), "\\1")
|
60
|
+
pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
|
61
|
+
if br
|
62
|
+
pee = pee.gsub(Regexp.new('<(script|style).*?</\1>')) { |match| match.gsub("\n", "<WPPreserveNewline />") }
|
63
|
+
pee = pee.gsub(Regexp.new('(?<!<br />)\s*\n'), "<br />\n")
|
64
|
+
pee = pee.gsub('<WPPreserveNewline />', "\n")
|
65
|
+
end
|
66
|
+
pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*<br />'), "\\1")
|
67
|
+
pee = pee.gsub(Regexp.new('<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)'), "\\1")
|
68
|
+
pee = pee.gsub(Regexp.new('\n</p>$'), '</p>')
|
69
|
+
|
70
|
+
pre_tags.each do |name, value|
|
71
|
+
pee.gsub!(name, value)
|
72
|
+
end
|
73
|
+
pee
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|