bunto-import 2.0.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +21 -21
- data/README.markdown +33 -33
- data/lib/bunto-import.rb +49 -49
- data/lib/bunto-import/importer.rb +26 -26
- data/lib/bunto-import/importers.rb +10 -10
- data/lib/bunto-import/importers/behance.rb +80 -80
- data/lib/bunto-import/importers/blogger.rb +330 -264
- data/lib/bunto-import/importers/csv.rb +96 -96
- data/lib/bunto-import/importers/drupal6.rb +53 -139
- data/lib/bunto-import/importers/drupal7.rb +54 -111
- data/lib/bunto-import/importers/drupal_common.rb +157 -0
- data/lib/bunto-import/importers/easyblog.rb +96 -96
- data/lib/bunto-import/importers/enki.rb +74 -74
- data/lib/bunto-import/importers/ghost.rb +68 -68
- data/lib/bunto-import/importers/google_reader.rb +64 -64
- data/lib/bunto-import/importers/joomla.rb +92 -90
- data/lib/bunto-import/importers/joomla3.rb +91 -91
- data/lib/bunto-import/importers/jrnl.rb +125 -125
- data/lib/bunto-import/importers/marley.rb +72 -72
- data/lib/bunto-import/importers/mephisto.rb +99 -99
- data/lib/bunto-import/importers/mt.rb +257 -257
- data/lib/bunto-import/importers/posterous.rb +130 -130
- data/lib/bunto-import/importers/rss.rb +62 -62
- data/lib/bunto-import/importers/s9y.rb +60 -60
- data/lib/bunto-import/importers/s9y_database.rb +363 -0
- data/lib/bunto-import/importers/textpattern.rb +70 -70
- data/lib/bunto-import/importers/tumblr.rb +300 -289
- data/lib/bunto-import/importers/typo.rb +88 -88
- data/lib/bunto-import/importers/wordpress.rb +372 -372
- data/lib/bunto-import/importers/wordpressdotcom.rb +207 -207
- data/lib/bunto-import/util.rb +76 -76
- data/lib/bunto-import/version.rb +3 -3
- data/lib/bunto/commands/import.rb +79 -79
- metadata +84 -54
@@ -1,207 +1,207 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
module BuntoImport
|
4
|
-
module Importers
|
5
|
-
class WordpressDotCom < Importer
|
6
|
-
def self.require_deps
|
7
|
-
BuntoImport.require_with_fallback(%w[
|
8
|
-
rubygems
|
9
|
-
fileutils
|
10
|
-
safe_yaml
|
11
|
-
hpricot
|
12
|
-
time
|
13
|
-
open-uri
|
14
|
-
open_uri_redirections
|
15
|
-
])
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.specify_options(c)
|
19
|
-
c.option 'source', '--source FILE', 'WordPress export XML file (default: "wordpress.xml")'
|
20
|
-
c.option 'no_fetch_images', '--no-fetch-images', 'Do not fetch the images referenced in the posts'
|
21
|
-
c.option 'assets_folder', '--assets_folder FOLDER', 'Folder where assets such as images will be downloaded to (default: assets)'
|
22
|
-
end
|
23
|
-
|
24
|
-
# Will modify post DOM tree
|
25
|
-
def self.download_images(title, post_hpricot, assets_folder)
|
26
|
-
images = (post_hpricot/"img")
|
27
|
-
if images.length == 0
|
28
|
-
return
|
29
|
-
end
|
30
|
-
puts "Downloading images for " + title
|
31
|
-
images.each do |i|
|
32
|
-
uri = i["src"]
|
33
|
-
|
34
|
-
i["src"] = "{{ site.baseurl }}/%s/%s" % [assets_folder, File.basename(uri)]
|
35
|
-
dst = File.join(assets_folder, File.basename(uri))
|
36
|
-
puts " " + uri
|
37
|
-
if File.exist?(dst)
|
38
|
-
puts " Already in cache. Clean assets folder if you want a redownload."
|
39
|
-
next
|
40
|
-
end
|
41
|
-
begin
|
42
|
-
open(uri, allow_redirections: :safe) {|f|
|
43
|
-
File.open(dst, "wb") do |out|
|
44
|
-
out.puts f.read
|
45
|
-
end
|
46
|
-
}
|
47
|
-
puts " OK!"
|
48
|
-
rescue => e
|
49
|
-
puts " Error: #{e.message}"
|
50
|
-
puts e.backtrace.join("\n")
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
class Item
|
56
|
-
def initialize(node)
|
57
|
-
@node = node
|
58
|
-
end
|
59
|
-
|
60
|
-
def text_for(path)
|
61
|
-
@node.at(path).inner_text
|
62
|
-
end
|
63
|
-
|
64
|
-
def title
|
65
|
-
@title ||= text_for(:title).strip
|
66
|
-
end
|
67
|
-
|
68
|
-
def permalink_title
|
69
|
-
post_name = text_for('wp:post_name')
|
70
|
-
# Fallback to "prettified" title if post_name is empty (can happen)
|
71
|
-
@permalink_title ||= if post_name.empty?
|
72
|
-
WordpressDotCom.sluggify(title)
|
73
|
-
else
|
74
|
-
post_name
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
def published_at
|
79
|
-
if published?
|
80
|
-
@published_at ||= Time.parse(text_for('wp:post_date'))
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def status
|
85
|
-
@status ||= text_for('wp:status')
|
86
|
-
end
|
87
|
-
|
88
|
-
def post_type
|
89
|
-
@post_type ||= text_for('wp:post_type')
|
90
|
-
end
|
91
|
-
|
92
|
-
def file_name
|
93
|
-
@file_name ||= if published?
|
94
|
-
"#{published_at.strftime('%Y-%m-%d')}-#{permalink_title}.html"
|
95
|
-
else
|
96
|
-
"#{permalink_title}.html"
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
def directory_name
|
101
|
-
@directory_name ||= if !published? && post_type == 'post'
|
102
|
-
'_drafts'
|
103
|
-
else
|
104
|
-
"_#{post_type}s"
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def published?
|
109
|
-
@published ||= (status == 'publish')
|
110
|
-
end
|
111
|
-
|
112
|
-
def excerpt
|
113
|
-
@excerpt ||= begin
|
114
|
-
text = Hpricot(text_for('excerpt:encoded')).inner_text
|
115
|
-
if text.empty?
|
116
|
-
nil
|
117
|
-
else
|
118
|
-
text
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
def self.process(options)
|
125
|
-
source = options.fetch('source', "wordpress.xml")
|
126
|
-
fetch = !options.fetch('no_fetch_images', false)
|
127
|
-
assets_folder = options.fetch('assets_folder', 'assets')
|
128
|
-
FileUtils.mkdir_p(assets_folder)
|
129
|
-
|
130
|
-
import_count = Hash.new(0)
|
131
|
-
doc = Hpricot::XML(File.read(source))
|
132
|
-
# Fetch authors data from header
|
133
|
-
authors = Hash[
|
134
|
-
(doc/:channel/'wp:author').map do |author|
|
135
|
-
[author.at("wp:author_login").inner_text.strip, {
|
136
|
-
"login" => author.at("wp:author_login").inner_text.strip,
|
137
|
-
"email" => author.at("wp:author_email").inner_text,
|
138
|
-
"display_name" => author.at("wp:author_display_name").inner_text,
|
139
|
-
"first_name" => author.at("wp:author_first_name").inner_text,
|
140
|
-
"last_name" => author.at("wp:author_last_name").inner_text
|
141
|
-
}]
|
142
|
-
end
|
143
|
-
] rescue {}
|
144
|
-
|
145
|
-
(doc/:channel/:item).each do |node|
|
146
|
-
item = Item.new(node)
|
147
|
-
categories = node.search('category[@domain="category"]').map(&:inner_text).reject{|c| c == 'Uncategorized'}.uniq
|
148
|
-
tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
|
149
|
-
|
150
|
-
metas = Hash.new
|
151
|
-
node.search("wp:postmeta").each do |meta|
|
152
|
-
key = meta.at('wp:meta_key').inner_text
|
153
|
-
value = meta.at('wp:meta_value').inner_text
|
154
|
-
metas[key] = value
|
155
|
-
end
|
156
|
-
|
157
|
-
author_login = item.text_for('dc:creator').strip
|
158
|
-
|
159
|
-
header = {
|
160
|
-
'layout' => item.post_type,
|
161
|
-
'title' => item.title,
|
162
|
-
'date' => item.published_at,
|
163
|
-
'type' => item.post_type,
|
164
|
-
'published' => item.published?,
|
165
|
-
'status' => item.status,
|
166
|
-
'categories' => categories,
|
167
|
-
'tags' => tags,
|
168
|
-
'meta' => metas,
|
169
|
-
'author' => authors[author_login]
|
170
|
-
}
|
171
|
-
|
172
|
-
begin
|
173
|
-
content = Hpricot(item.text_for('content:encoded'))
|
174
|
-
header['excerpt'] = item.excerpt if item.excerpt
|
175
|
-
|
176
|
-
if fetch
|
177
|
-
download_images(item.title, content, assets_folder)
|
178
|
-
end
|
179
|
-
|
180
|
-
FileUtils.mkdir_p item.directory_name
|
181
|
-
File.open(File.join(item.directory_name, item.file_name), "w") do |f|
|
182
|
-
f.puts header.to_yaml
|
183
|
-
f.puts '---'
|
184
|
-
f.puts Util.wpautop(content.to_html)
|
185
|
-
end
|
186
|
-
rescue => e
|
187
|
-
puts "Couldn't import post!"
|
188
|
-
puts "Title: #{item.title}"
|
189
|
-
puts "Name/Slug: #{item.file_name}\n"
|
190
|
-
puts "Error: #{e.message}"
|
191
|
-
next
|
192
|
-
end
|
193
|
-
|
194
|
-
import_count[item.post_type] += 1
|
195
|
-
end
|
196
|
-
|
197
|
-
import_count.each do |key, value|
|
198
|
-
puts "Imported #{value} #{key}s"
|
199
|
-
end
|
200
|
-
end
|
201
|
-
|
202
|
-
def self.sluggify(title)
|
203
|
-
title.gsub(/[^[:alnum:]]+/, '-').downcase
|
204
|
-
end
|
205
|
-
end
|
206
|
-
end
|
207
|
-
end
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module BuntoImport
  module Importers
    # Importer for WordPress.com XML export files.
    #
    # Reads the export with Hpricot, optionally downloads the images referenced
    # by each item, and writes one HTML file with YAML front matter per item
    # into a directory derived from the item's post type and status.
    class WordpressDotCom < Importer
      # Requires the libraries this importer uses through
      # BuntoImport.require_with_fallback.
      def self.require_deps
        BuntoImport.require_with_fallback(%w[
          rubygems
          fileutils
          safe_yaml
          hpricot
          time
          open-uri
          open_uri_redirections
        ])
      end

      # Registers the command-line options understood by this importer.
      #
      # c - the command object the options are declared on.
      def self.specify_options(c)
        c.option 'source', '--source FILE', 'WordPress export XML file (default: "wordpress.xml")'
        c.option 'no_fetch_images', '--no-fetch-images', 'Do not fetch the images referenced in the posts'
        c.option 'assets_folder', '--assets_folder FOLDER', 'Folder where assets such as images will be downloaded to (default: assets)'
      end

      # Downloads every image referenced by a post into assets_folder and
      # rewrites each <img> src to point at the local copy.
      #
      # Will modify post DOM tree.
      #
      # title         - post title, used only for progress output.
      # post_hpricot  - parsed post content whose <img> elements are rewritten.
      # assets_folder - directory the image files are written to.
      #
      # Download failures are reported on stdout and do not abort the import.
      def self.download_images(title, post_hpricot, assets_folder)
        images = (post_hpricot/"img")
        return if images.empty?

        puts "Downloading images for " + title
        images.each do |i|
          uri = i["src"]
          # An <img> without a src has nothing to download; skipping it keeps
          # File.basename from raising and failing the whole post.
          next if uri.nil? || uri.empty?

          i["src"] = "{{ site.baseurl }}/%s/%s" % [assets_folder, File.basename(uri)]
          dst = File.join(assets_folder, File.basename(uri))
          puts " " + uri
          if File.exist?(dst)
            puts " Already in cache. Clean assets folder if you want a redownload."
            next
          end
          begin
            # OpenURI.open_uri only accepts URIs. Kernel#open would treat a src
            # such as "|cmd" as a command to run (the src comes from imported
            # post HTML) and no longer fetches URLs on Ruby 3.
            OpenURI.open_uri(uri, allow_redirections: :safe) do |f|
              # write, not puts: puts appends a newline and corrupts binary data.
              File.open(dst, "wb") { |out| out.write(f.read) }
            end
            puts " OK!"
          rescue => e
            puts " Error: #{e.message}"
            puts e.backtrace.join("\n")
          end
        end
      end

      # Wraps one <item> node of the export and exposes the values needed to
      # build the output file. Values are memoized after the first lookup.
      class Item
        # node - the parsed <item> element.
        def initialize(node)
          @node = node
        end

        # Returns the inner text of the first element matching path.
        # Raises NoMethodError when the item has no such element.
        def text_for(path)
          @node.at(path).inner_text
        end

        # Item title with surrounding whitespace removed.
        def title
          @title ||= text_for(:title).strip
        end

        # Slug used in the output file name.
        def permalink_title
          @permalink_title ||= begin
            post_name = text_for('wp:post_name')
            # Fallback to "prettified" title if post_name is empty (can happen)
            post_name.empty? ? WordpressDotCom.sluggify(title) : post_name
          end
        end

        # Publication time parsed from wp:post_date, or nil when the item is
        # not published.
        def published_at
          return unless published?

          @published_at ||= Time.parse(text_for('wp:post_date'))
        end

        # Raw wp:status value.
        def status
          @status ||= text_for('wp:status')
        end

        # Raw wp:post_type value.
        def post_type
          @post_type ||= text_for('wp:post_type')
        end

        # Output file name; published items are prefixed with their date.
        def file_name
          @file_name ||= if published?
            "#{published_at.strftime('%Y-%m-%d')}-#{permalink_title}.html"
          else
            "#{permalink_title}.html"
          end
        end

        # Output directory: '_drafts' for unpublished posts, otherwise the
        # pluralized post type prefixed with an underscore.
        def directory_name
          @directory_name ||= if !published? && post_type == 'post'
            '_drafts'
          else
            "_#{post_type}s"
          end
        end

        # True when the item's status is 'publish'.
        def published?
          # status is already memoized; `||=` could never cache a false result.
          status == 'publish'
        end

        # Plain-text excerpt, or nil when the item has none.
        def excerpt
          @excerpt ||= begin
            text = Hpricot(text_for('excerpt:encoded')).inner_text
            text.empty? ? nil : text
          end
        end
      end

      # Imports every <item> of the export file.
      #
      # options - Hash with optional string keys:
      #           'source'          - export file path (default "wordpress.xml")
      #           'no_fetch_images' - truthy to skip image downloads
      #           'assets_folder'   - image download directory (default "assets")
      #
      # An item that fails inside the write step is reported on stdout and
      # skipped; a per-type summary is printed at the end.
      def self.process(options)
        source = options.fetch('source', "wordpress.xml")
        fetch = !options.fetch('no_fetch_images', false)
        assets_folder = options.fetch('assets_folder', 'assets')
        FileUtils.mkdir_p(assets_folder)

        import_count = Hash.new(0)
        doc = Hpricot::XML(File.read(source))
        # Fetch authors data from header
        authors = extract_authors(doc)

        (doc/:channel/:item).each do |node|
          item = Item.new(node)
          categories = node.search('category[@domain="category"]').map(&:inner_text).reject{|c| c == 'Uncategorized'}.uniq
          tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq

          metas = {}
          node.search("wp:postmeta").each do |meta|
            key = meta.at('wp:meta_key').inner_text
            value = meta.at('wp:meta_value').inner_text
            metas[key] = value
          end

          author_login = item.text_for('dc:creator').strip

          header = {
            'layout' => item.post_type,
            'title' => item.title,
            'date' => item.published_at,
            'type' => item.post_type,
            'published' => item.published?,
            'status' => item.status,
            'categories' => categories,
            'tags' => tags,
            'meta' => metas,
            'author' => authors[author_login]
          }

          begin
            content = Hpricot(item.text_for('content:encoded'))
            header['excerpt'] = item.excerpt if item.excerpt

            download_images(item.title, content, assets_folder) if fetch

            FileUtils.mkdir_p item.directory_name
            File.open(File.join(item.directory_name, item.file_name), "w") do |f|
              f.puts header.to_yaml
              f.puts '---'
              f.puts Util.wpautop(content.to_html)
            end
          rescue => e
            puts "Couldn't import post!"
            puts "Title: #{item.title}"
            puts "Name/Slug: #{item.file_name}\n"
            puts "Error: #{e.message}"
            next
          end

          import_count[item.post_type] += 1
        end

        import_count.each do |key, value|
          puts "Imported #{value} #{key}s"
        end
      end

      # Builds a login => details Hash from the <wp:author> elements of the
      # channel. Best effort: if any author entry cannot be read, an empty Hash
      # is returned and items are imported without author details.
      def self.extract_authors(doc)
        (doc/:channel/'wp:author').each_with_object({}) do |author, authors|
          login = author.at("wp:author_login").inner_text.strip
          authors[login] = {
            "login" => login,
            "email" => author.at("wp:author_email").inner_text,
            "display_name" => author.at("wp:author_display_name").inner_text,
            "first_name" => author.at("wp:author_first_name").inner_text,
            "last_name" => author.at("wp:author_last_name").inner_text
          }
        end
      rescue StandardError
        {}
      end
      private_class_method :extract_authors

      # Lower-cases title and replaces each run of non-alphanumeric characters
      # with a single dash.
      def self.sluggify(title)
        title.gsub(/[^[:alnum:]]+/, '-').downcase
      end
    end
  end
end
|
data/lib/bunto-import/util.rb
CHANGED
@@ -1,76 +1,76 @@
|
|
1
|
-
module BuntoImport
|
2
|
-
module Util
|
3
|
-
|
4
|
-
# Ruby translation of wordpress wpautop (see https://core.trac.wordpress.org/browser/trunk/src/wp-includes/formatting.php)
|
5
|
-
#
|
6
|
-
# A group of regex replaces used to identify text formatted with newlines and
|
7
|
-
# replace double line-breaks with HTML paragraph tags. The remaining
|
8
|
-
# line-breaks after conversion become <<br />> tags, unless $br is set to false
|
9
|
-
#
|
10
|
-
# @param string pee The text which has to be formatted.
|
11
|
-
# @param bool br Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
|
12
|
-
# @return string Text which has been converted into correct paragraph tags.
|
13
|
-
#
|
14
|
-
def self.wpautop(pee, br = true)
|
15
|
-
return '' if pee.strip == ''
|
16
|
-
|
17
|
-
allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|noscript|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
|
18
|
-
pre_tags = {}
|
19
|
-
pee = pee + "\n"
|
20
|
-
|
21
|
-
if pee.include?('<pre')
|
22
|
-
pee_parts = pee.split('</pre>')
|
23
|
-
last_pee = pee_parts.pop
|
24
|
-
pee = ''
|
25
|
-
pee_parts.each_with_index do |pee_part, i|
|
26
|
-
start = pee_part.index('<pre')
|
27
|
-
|
28
|
-
unless start
|
29
|
-
pee += pee_part
|
30
|
-
next
|
31
|
-
end
|
32
|
-
|
33
|
-
name = "<pre wp-pre-tag-#{i}></pre>"
|
34
|
-
pre_tags[name] = pee_part[start..-1] + '</pre>'
|
35
|
-
|
36
|
-
pee += pee_part[0, start] + name
|
37
|
-
end
|
38
|
-
pee += last_pee
|
39
|
-
end
|
40
|
-
|
41
|
-
pee = pee.gsub(Regexp.new('<br />\s*<br />'), "\n\n")
|
42
|
-
pee = pee.gsub(Regexp.new("(<" + allblocks + "[^>]*>)"), "\n\\1")
|
43
|
-
pee = pee.gsub(Regexp.new("(</" + allblocks + ">)"), "\\1\n\n")
|
44
|
-
pee = pee.gsub("\r\n", "\n").gsub("\r", "\n")
|
45
|
-
if pee.include? '<object'
|
46
|
-
pee = pee.gsub(Regexp.new('\s*<param([^>]*)>\s*'), "<param\\1>")
|
47
|
-
pee = pee.gsub(Regexp.new('\s*</embed>\s*'), '</embed>')
|
48
|
-
end
|
49
|
-
|
50
|
-
pees = pee.split(/\n\s*\n/).compact
|
51
|
-
pee = ''
|
52
|
-
pees.each { |tinkle| pee += '<p>' + tinkle.chomp("\n") + "</p>\n" }
|
53
|
-
pee = pee.gsub(Regexp.new('<p>\s*</p>'), '')
|
54
|
-
pee = pee.gsub(Regexp.new('<p>([^<]+)</(div|address|form)>'), "<p>\\1</p></\\2>")
|
55
|
-
pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
|
56
|
-
pee = pee.gsub(Regexp.new('<p>(<li.+?)</p>'), "\\1")
|
57
|
-
pee = pee.gsub(Regexp.new('<p><blockquote([^>]*)>', 'i'), "<blockquote\\1><p>")
|
58
|
-
pee = pee.gsub('</blockquote></p>', '</p></blockquote>')
|
59
|
-
pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)'), "\\1")
|
60
|
-
pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
|
61
|
-
if br
|
62
|
-
pee = pee.gsub(Regexp.new('<(script|style).*?</\1>')) { |match| match.gsub("\n", "<WPPreserveNewline />") }
|
63
|
-
pee = pee.gsub(Regexp.new('(?<!<br />)\s*\n'), "<br />\n")
|
64
|
-
pee = pee.gsub('<WPPreserveNewline />', "\n")
|
65
|
-
end
|
66
|
-
pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*<br />'), "\\1")
|
67
|
-
pee = pee.gsub(Regexp.new('<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)'), "\\1")
|
68
|
-
pee = pee.gsub(Regexp.new('\n</p>$'), '</p>')
|
69
|
-
|
70
|
-
pre_tags.each do |name, value|
|
71
|
-
pee.gsub!(name, value)
|
72
|
-
end
|
73
|
-
pee
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
1
|
+
module BuntoImport
  module Util
    # Ruby translation of wordpress wpautop (see https://core.trac.wordpress.org/browser/trunk/src/wp-includes/formatting.php)
    #
    # A group of regex replaces used to identify text formatted with newlines and
    # replace double line-breaks with HTML paragraph tags. The remaining
    # line-breaks after conversion become <br /> tags, unless br is set to false.
    #
    # @param pee [String] The text which has to be formatted.
    # @param br [Boolean] Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
    # @return [String] Text which has been converted into correct paragraph tags.
    #   An input containing only whitespace yields an empty string.
    def self.wpautop(pee, br = true)
      return '' if pee.strip == ''

      allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|noscript|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
      pre_tags = {}
      # The trailing newline guarantees a non-empty last segment when the text
      # ends with </pre>.
      pee = pee + "\n"

      # Swap every <pre>...</pre> for a placeholder so none of the rewriting
      # below touches preformatted content; the originals are put back last.
      if pee.include?('<pre')
        pee_parts = pee.split('</pre>')
        last_pee = pee_parts.pop
        pee = ''
        pee_parts.each_with_index do |pee_part, i|
          start = pee_part.index('<pre')

          # Malformed html: a closing </pre> without an opening tag.
          unless start
            pee += pee_part
            next
          end

          name = "<pre wp-pre-tag-#{i}></pre>"
          pre_tags[name] = pee_part[start..-1] + '</pre>'

          pee += pee_part[0, start] + name
        end
        pee += last_pee
      end

      # Normalize paragraph boundaries: doubled <br />, block-level tags and
      # line endings.
      pee = pee.gsub(Regexp.new('<br />\s*<br />'), "\n\n")
      pee = pee.gsub(Regexp.new("(<" + allblocks + "[^>]*>)"), "\n\\1")
      pee = pee.gsub(Regexp.new("(</" + allblocks + ">)"), "\\1\n\n")
      pee = pee.gsub("\r\n", "\n").gsub("\r", "\n")
      if pee.include? '<object'
        pee = pee.gsub(Regexp.new('\s*<param([^>]*)>\s*'), "<param\\1>")
        pee = pee.gsub(Regexp.new('\s*</embed>\s*'), '</embed>')
      end

      # Wrap each blank-line separated chunk in <p>...</p>.
      pees = pee.split(/\n\s*\n/).compact
      pee = pees.map { |tinkle| '<p>' + tinkle.chomp("\n") + "</p>\n" }.join

      # Undo paragraphs that ended up empty or wrapped around block elements.
      pee = pee.gsub(Regexp.new('<p>\s*</p>'), '')
      pee = pee.gsub(Regexp.new('<p>([^<]+)</(div|address|form)>'), "<p>\\1</p></\\2>")
      pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
      pee = pee.gsub(Regexp.new('<p>(<li.+?)</p>'), "\\1")
      pee = pee.gsub(Regexp.new('<p><blockquote([^>]*)>', Regexp::IGNORECASE), "<blockquote\\1><p>")
      pee = pee.gsub('</blockquote></p>', '</p></blockquote>')
      pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)'), "\\1")
      pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
      if br
        # MULTILINE mirrors the /s flag of the PHP original: without it `.`
        # stops at a newline, so multi-line <script>/<style> bodies were never
        # protected and received <br /> tags.
        pee = pee.gsub(Regexp.new('<(script|style).*?</\1>', Regexp::MULTILINE)) { |match| match.gsub("\n", "<WPPreserveNewline />") }
        pee = pee.gsub(Regexp.new('(?<!<br />)\s*\n'), "<br />\n")
        pee = pee.gsub('<WPPreserveNewline />', "\n")
      end
      pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*<br />'), "\\1")
      pee = pee.gsub(Regexp.new('<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)'), "\\1")
      pee = pee.gsub(Regexp.new('\n</p>$'), '</p>')

      # Block form inserts the saved <pre> content verbatim. A replacement
      # string would interpret backslash sequences in it (\0, \1, \\) as
      # backreferences and mangle code samples.
      pre_tags.each do |name, value|
        pee = pee.gsub(name) { value }
      end
      pee
    end
  end
end
|