bunto-import 2.0.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/LICENSE +21 -21
  3. data/README.markdown +33 -33
  4. data/lib/bunto-import.rb +49 -49
  5. data/lib/bunto-import/importer.rb +26 -26
  6. data/lib/bunto-import/importers.rb +10 -10
  7. data/lib/bunto-import/importers/behance.rb +80 -80
  8. data/lib/bunto-import/importers/blogger.rb +330 -264
  9. data/lib/bunto-import/importers/csv.rb +96 -96
  10. data/lib/bunto-import/importers/drupal6.rb +53 -139
  11. data/lib/bunto-import/importers/drupal7.rb +54 -111
  12. data/lib/bunto-import/importers/drupal_common.rb +157 -0
  13. data/lib/bunto-import/importers/easyblog.rb +96 -96
  14. data/lib/bunto-import/importers/enki.rb +74 -74
  15. data/lib/bunto-import/importers/ghost.rb +68 -68
  16. data/lib/bunto-import/importers/google_reader.rb +64 -64
  17. data/lib/bunto-import/importers/joomla.rb +92 -90
  18. data/lib/bunto-import/importers/joomla3.rb +91 -91
  19. data/lib/bunto-import/importers/jrnl.rb +125 -125
  20. data/lib/bunto-import/importers/marley.rb +72 -72
  21. data/lib/bunto-import/importers/mephisto.rb +99 -99
  22. data/lib/bunto-import/importers/mt.rb +257 -257
  23. data/lib/bunto-import/importers/posterous.rb +130 -130
  24. data/lib/bunto-import/importers/rss.rb +62 -62
  25. data/lib/bunto-import/importers/s9y.rb +60 -60
  26. data/lib/bunto-import/importers/s9y_database.rb +363 -0
  27. data/lib/bunto-import/importers/textpattern.rb +70 -70
  28. data/lib/bunto-import/importers/tumblr.rb +300 -289
  29. data/lib/bunto-import/importers/typo.rb +88 -88
  30. data/lib/bunto-import/importers/wordpress.rb +372 -372
  31. data/lib/bunto-import/importers/wordpressdotcom.rb +207 -207
  32. data/lib/bunto-import/util.rb +76 -76
  33. data/lib/bunto-import/version.rb +3 -3
  34. data/lib/bunto/commands/import.rb +79 -79
  35. metadata +84 -54
@@ -1,207 +1,207 @@
1
- # encoding: UTF-8
2
-
3
- module BuntoImport
4
- module Importers
5
- class WordpressDotCom < Importer
6
- def self.require_deps
7
- BuntoImport.require_with_fallback(%w[
8
- rubygems
9
- fileutils
10
- safe_yaml
11
- hpricot
12
- time
13
- open-uri
14
- open_uri_redirections
15
- ])
16
- end
17
-
18
- def self.specify_options(c)
19
- c.option 'source', '--source FILE', 'WordPress export XML file (default: "wordpress.xml")'
20
- c.option 'no_fetch_images', '--no-fetch-images', 'Do not fetch the images referenced in the posts'
21
- c.option 'assets_folder', '--assets_folder FOLDER', 'Folder where assets such as images will be downloaded to (default: assets)'
22
- end
23
-
24
- # Will modify post DOM tree
25
- def self.download_images(title, post_hpricot, assets_folder)
26
- images = (post_hpricot/"img")
27
- if images.length == 0
28
- return
29
- end
30
- puts "Downloading images for " + title
31
- images.each do |i|
32
- uri = i["src"]
33
-
34
- i["src"] = "{{ site.baseurl }}/%s/%s" % [assets_folder, File.basename(uri)]
35
- dst = File.join(assets_folder, File.basename(uri))
36
- puts " " + uri
37
- if File.exist?(dst)
38
- puts " Already in cache. Clean assets folder if you want a redownload."
39
- next
40
- end
41
- begin
42
- open(uri, allow_redirections: :safe) {|f|
43
- File.open(dst, "wb") do |out|
44
- out.puts f.read
45
- end
46
- }
47
- puts " OK!"
48
- rescue => e
49
- puts " Error: #{e.message}"
50
- puts e.backtrace.join("\n")
51
- end
52
- end
53
- end
54
-
55
- class Item
56
- def initialize(node)
57
- @node = node
58
- end
59
-
60
- def text_for(path)
61
- @node.at(path).inner_text
62
- end
63
-
64
- def title
65
- @title ||= text_for(:title).strip
66
- end
67
-
68
- def permalink_title
69
- post_name = text_for('wp:post_name')
70
- # Fallback to "prettified" title if post_name is empty (can happen)
71
- @permalink_title ||= if post_name.empty?
72
- WordpressDotCom.sluggify(title)
73
- else
74
- post_name
75
- end
76
- end
77
-
78
- def published_at
79
- if published?
80
- @published_at ||= Time.parse(text_for('wp:post_date'))
81
- end
82
- end
83
-
84
- def status
85
- @status ||= text_for('wp:status')
86
- end
87
-
88
- def post_type
89
- @post_type ||= text_for('wp:post_type')
90
- end
91
-
92
- def file_name
93
- @file_name ||= if published?
94
- "#{published_at.strftime('%Y-%m-%d')}-#{permalink_title}.html"
95
- else
96
- "#{permalink_title}.html"
97
- end
98
- end
99
-
100
- def directory_name
101
- @directory_name ||= if !published? && post_type == 'post'
102
- '_drafts'
103
- else
104
- "_#{post_type}s"
105
- end
106
- end
107
-
108
- def published?
109
- @published ||= (status == 'publish')
110
- end
111
-
112
- def excerpt
113
- @excerpt ||= begin
114
- text = Hpricot(text_for('excerpt:encoded')).inner_text
115
- if text.empty?
116
- nil
117
- else
118
- text
119
- end
120
- end
121
- end
122
- end
123
-
124
- def self.process(options)
125
- source = options.fetch('source', "wordpress.xml")
126
- fetch = !options.fetch('no_fetch_images', false)
127
- assets_folder = options.fetch('assets_folder', 'assets')
128
- FileUtils.mkdir_p(assets_folder)
129
-
130
- import_count = Hash.new(0)
131
- doc = Hpricot::XML(File.read(source))
132
- # Fetch authors data from header
133
- authors = Hash[
134
- (doc/:channel/'wp:author').map do |author|
135
- [author.at("wp:author_login").inner_text.strip, {
136
- "login" => author.at("wp:author_login").inner_text.strip,
137
- "email" => author.at("wp:author_email").inner_text,
138
- "display_name" => author.at("wp:author_display_name").inner_text,
139
- "first_name" => author.at("wp:author_first_name").inner_text,
140
- "last_name" => author.at("wp:author_last_name").inner_text
141
- }]
142
- end
143
- ] rescue {}
144
-
145
- (doc/:channel/:item).each do |node|
146
- item = Item.new(node)
147
- categories = node.search('category[@domain="category"]').map(&:inner_text).reject{|c| c == 'Uncategorized'}.uniq
148
- tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
149
-
150
- metas = Hash.new
151
- node.search("wp:postmeta").each do |meta|
152
- key = meta.at('wp:meta_key').inner_text
153
- value = meta.at('wp:meta_value').inner_text
154
- metas[key] = value
155
- end
156
-
157
- author_login = item.text_for('dc:creator').strip
158
-
159
- header = {
160
- 'layout' => item.post_type,
161
- 'title' => item.title,
162
- 'date' => item.published_at,
163
- 'type' => item.post_type,
164
- 'published' => item.published?,
165
- 'status' => item.status,
166
- 'categories' => categories,
167
- 'tags' => tags,
168
- 'meta' => metas,
169
- 'author' => authors[author_login]
170
- }
171
-
172
- begin
173
- content = Hpricot(item.text_for('content:encoded'))
174
- header['excerpt'] = item.excerpt if item.excerpt
175
-
176
- if fetch
177
- download_images(item.title, content, assets_folder)
178
- end
179
-
180
- FileUtils.mkdir_p item.directory_name
181
- File.open(File.join(item.directory_name, item.file_name), "w") do |f|
182
- f.puts header.to_yaml
183
- f.puts '---'
184
- f.puts Util.wpautop(content.to_html)
185
- end
186
- rescue => e
187
- puts "Couldn't import post!"
188
- puts "Title: #{item.title}"
189
- puts "Name/Slug: #{item.file_name}\n"
190
- puts "Error: #{e.message}"
191
- next
192
- end
193
-
194
- import_count[item.post_type] += 1
195
- end
196
-
197
- import_count.each do |key, value|
198
- puts "Imported #{value} #{key}s"
199
- end
200
- end
201
-
202
- def self.sluggify(title)
203
- title.gsub(/[^[:alnum:]]+/, '-').downcase
204
- end
205
- end
206
- end
207
- end
1
+ # encoding: UTF-8
2
+
3
+ module BuntoImport
4
+ module Importers
5
+ class WordpressDotCom < Importer
6
+ def self.require_deps
7
+ BuntoImport.require_with_fallback(%w[
8
+ rubygems
9
+ fileutils
10
+ safe_yaml
11
+ hpricot
12
+ time
13
+ open-uri
14
+ open_uri_redirections
15
+ ])
16
+ end
17
+
18
+ def self.specify_options(c)
19
+ c.option 'source', '--source FILE', 'WordPress export XML file (default: "wordpress.xml")'
20
+ c.option 'no_fetch_images', '--no-fetch-images', 'Do not fetch the images referenced in the posts'
21
+ c.option 'assets_folder', '--assets_folder FOLDER', 'Folder where assets such as images will be downloaded to (default: assets)'
22
+ end
23
+
24
+ # Will modify post DOM tree
25
+ def self.download_images(title, post_hpricot, assets_folder)
26
+ images = (post_hpricot/"img")
27
+ if images.length == 0
28
+ return
29
+ end
30
+ puts "Downloading images for " + title
31
+ images.each do |i|
32
+ uri = i["src"]
33
+
34
+ i["src"] = "{{ site.baseurl }}/%s/%s" % [assets_folder, File.basename(uri)]
35
+ dst = File.join(assets_folder, File.basename(uri))
36
+ puts " " + uri
37
+ if File.exist?(dst)
38
+ puts " Already in cache. Clean assets folder if you want a redownload."
39
+ next
40
+ end
41
+ begin
42
+ open(uri, allow_redirections: :safe) {|f|
43
+ File.open(dst, "wb") do |out|
44
+ out.puts f.read
45
+ end
46
+ }
47
+ puts " OK!"
48
+ rescue => e
49
+ puts " Error: #{e.message}"
50
+ puts e.backtrace.join("\n")
51
+ end
52
+ end
53
+ end
54
+
55
+ class Item
56
+ def initialize(node)
57
+ @node = node
58
+ end
59
+
60
+ def text_for(path)
61
+ @node.at(path).inner_text
62
+ end
63
+
64
+ def title
65
+ @title ||= text_for(:title).strip
66
+ end
67
+
68
+ def permalink_title
69
+ post_name = text_for('wp:post_name')
70
+ # Fallback to "prettified" title if post_name is empty (can happen)
71
+ @permalink_title ||= if post_name.empty?
72
+ WordpressDotCom.sluggify(title)
73
+ else
74
+ post_name
75
+ end
76
+ end
77
+
78
+ def published_at
79
+ if published?
80
+ @published_at ||= Time.parse(text_for('wp:post_date'))
81
+ end
82
+ end
83
+
84
+ def status
85
+ @status ||= text_for('wp:status')
86
+ end
87
+
88
+ def post_type
89
+ @post_type ||= text_for('wp:post_type')
90
+ end
91
+
92
+ def file_name
93
+ @file_name ||= if published?
94
+ "#{published_at.strftime('%Y-%m-%d')}-#{permalink_title}.html"
95
+ else
96
+ "#{permalink_title}.html"
97
+ end
98
+ end
99
+
100
+ def directory_name
101
+ @directory_name ||= if !published? && post_type == 'post'
102
+ '_drafts'
103
+ else
104
+ "_#{post_type}s"
105
+ end
106
+ end
107
+
108
+ def published?
109
+ @published ||= (status == 'publish')
110
+ end
111
+
112
+ def excerpt
113
+ @excerpt ||= begin
114
+ text = Hpricot(text_for('excerpt:encoded')).inner_text
115
+ if text.empty?
116
+ nil
117
+ else
118
+ text
119
+ end
120
+ end
121
+ end
122
+ end
123
+
124
+ def self.process(options)
125
+ source = options.fetch('source', "wordpress.xml")
126
+ fetch = !options.fetch('no_fetch_images', false)
127
+ assets_folder = options.fetch('assets_folder', 'assets')
128
+ FileUtils.mkdir_p(assets_folder)
129
+
130
+ import_count = Hash.new(0)
131
+ doc = Hpricot::XML(File.read(source))
132
+ # Fetch authors data from header
133
+ authors = Hash[
134
+ (doc/:channel/'wp:author').map do |author|
135
+ [author.at("wp:author_login").inner_text.strip, {
136
+ "login" => author.at("wp:author_login").inner_text.strip,
137
+ "email" => author.at("wp:author_email").inner_text,
138
+ "display_name" => author.at("wp:author_display_name").inner_text,
139
+ "first_name" => author.at("wp:author_first_name").inner_text,
140
+ "last_name" => author.at("wp:author_last_name").inner_text
141
+ }]
142
+ end
143
+ ] rescue {}
144
+
145
+ (doc/:channel/:item).each do |node|
146
+ item = Item.new(node)
147
+ categories = node.search('category[@domain="category"]').map(&:inner_text).reject{|c| c == 'Uncategorized'}.uniq
148
+ tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
149
+
150
+ metas = Hash.new
151
+ node.search("wp:postmeta").each do |meta|
152
+ key = meta.at('wp:meta_key').inner_text
153
+ value = meta.at('wp:meta_value').inner_text
154
+ metas[key] = value
155
+ end
156
+
157
+ author_login = item.text_for('dc:creator').strip
158
+
159
+ header = {
160
+ 'layout' => item.post_type,
161
+ 'title' => item.title,
162
+ 'date' => item.published_at,
163
+ 'type' => item.post_type,
164
+ 'published' => item.published?,
165
+ 'status' => item.status,
166
+ 'categories' => categories,
167
+ 'tags' => tags,
168
+ 'meta' => metas,
169
+ 'author' => authors[author_login]
170
+ }
171
+
172
+ begin
173
+ content = Hpricot(item.text_for('content:encoded'))
174
+ header['excerpt'] = item.excerpt if item.excerpt
175
+
176
+ if fetch
177
+ download_images(item.title, content, assets_folder)
178
+ end
179
+
180
+ FileUtils.mkdir_p item.directory_name
181
+ File.open(File.join(item.directory_name, item.file_name), "w") do |f|
182
+ f.puts header.to_yaml
183
+ f.puts '---'
184
+ f.puts Util.wpautop(content.to_html)
185
+ end
186
+ rescue => e
187
+ puts "Couldn't import post!"
188
+ puts "Title: #{item.title}"
189
+ puts "Name/Slug: #{item.file_name}\n"
190
+ puts "Error: #{e.message}"
191
+ next
192
+ end
193
+
194
+ import_count[item.post_type] += 1
195
+ end
196
+
197
+ import_count.each do |key, value|
198
+ puts "Imported #{value} #{key}s"
199
+ end
200
+ end
201
+
202
+ def self.sluggify(title)
203
+ title.gsub(/[^[:alnum:]]+/, '-').downcase
204
+ end
205
+ end
206
+ end
207
+ end
@@ -1,76 +1,76 @@
1
- module BuntoImport
2
- module Util
3
-
4
- # Ruby translation of wordpress wpautop (see https://core.trac.wordpress.org/browser/trunk/src/wp-includes/formatting.php)
5
- #
6
- # A group of regex replaces used to identify text formatted with newlines and
7
- # replace double line-breaks with HTML paragraph tags. The remaining
8
- # line-breaks after conversion become <<br />> tags, unless $br is set to false
9
- #
10
- # @param string pee The text which has to be formatted.
11
- # @param bool br Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
12
- # @return string Text which has been converted into correct paragraph tags.
13
- #
14
- def self.wpautop(pee, br = true)
15
- return '' if pee.strip == ''
16
-
17
- allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|noscript|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
18
- pre_tags = {}
19
- pee = pee + "\n"
20
-
21
- if pee.include?('<pre')
22
- pee_parts = pee.split('</pre>')
23
- last_pee = pee_parts.pop
24
- pee = ''
25
- pee_parts.each_with_index do |pee_part, i|
26
- start = pee_part.index('<pre')
27
-
28
- unless start
29
- pee += pee_part
30
- next
31
- end
32
-
33
- name = "<pre wp-pre-tag-#{i}></pre>"
34
- pre_tags[name] = pee_part[start..-1] + '</pre>'
35
-
36
- pee += pee_part[0, start] + name
37
- end
38
- pee += last_pee
39
- end
40
-
41
- pee = pee.gsub(Regexp.new('<br />\s*<br />'), "\n\n")
42
- pee = pee.gsub(Regexp.new("(<" + allblocks + "[^>]*>)"), "\n\\1")
43
- pee = pee.gsub(Regexp.new("(</" + allblocks + ">)"), "\\1\n\n")
44
- pee = pee.gsub("\r\n", "\n").gsub("\r", "\n")
45
- if pee.include? '<object'
46
- pee = pee.gsub(Regexp.new('\s*<param([^>]*)>\s*'), "<param\\1>")
47
- pee = pee.gsub(Regexp.new('\s*</embed>\s*'), '</embed>')
48
- end
49
-
50
- pees = pee.split(/\n\s*\n/).compact
51
- pee = ''
52
- pees.each { |tinkle| pee += '<p>' + tinkle.chomp("\n") + "</p>\n" }
53
- pee = pee.gsub(Regexp.new('<p>\s*</p>'), '')
54
- pee = pee.gsub(Regexp.new('<p>([^<]+)</(div|address|form)>'), "<p>\\1</p></\\2>")
55
- pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
56
- pee = pee.gsub(Regexp.new('<p>(<li.+?)</p>'), "\\1")
57
- pee = pee.gsub(Regexp.new('<p><blockquote([^>]*)>', 'i'), "<blockquote\\1><p>")
58
- pee = pee.gsub('</blockquote></p>', '</p></blockquote>')
59
- pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)'), "\\1")
60
- pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
61
- if br
62
- pee = pee.gsub(Regexp.new('<(script|style).*?</\1>')) { |match| match.gsub("\n", "<WPPreserveNewline />") }
63
- pee = pee.gsub(Regexp.new('(?<!<br />)\s*\n'), "<br />\n")
64
- pee = pee.gsub('<WPPreserveNewline />', "\n")
65
- end
66
- pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*<br />'), "\\1")
67
- pee = pee.gsub(Regexp.new('<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)'), "\\1")
68
- pee = pee.gsub(Regexp.new('\n</p>$'), '</p>')
69
-
70
- pre_tags.each do |name, value|
71
- pee.gsub!(name, value)
72
- end
73
- pee
74
- end
75
- end
76
- end
1
+ module BuntoImport
2
+ module Util
3
+
4
+ # Ruby translation of wordpress wpautop (see https://core.trac.wordpress.org/browser/trunk/src/wp-includes/formatting.php)
5
+ #
6
+ # A group of regex replaces used to identify text formatted with newlines and
7
+ # replace double line-breaks with HTML paragraph tags. The remaining
8
+ # line-breaks after conversion become <<br />> tags, unless $br is set to false
9
+ #
10
+ # @param string pee The text which has to be formatted.
11
+ # @param bool br Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
12
+ # @return string Text which has been converted into correct paragraph tags.
13
+ #
14
+ def self.wpautop(pee, br = true)
15
+ return '' if pee.strip == ''
16
+
17
+ allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|noscript|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
18
+ pre_tags = {}
19
+ pee = pee + "\n"
20
+
21
+ if pee.include?('<pre')
22
+ pee_parts = pee.split('</pre>')
23
+ last_pee = pee_parts.pop
24
+ pee = ''
25
+ pee_parts.each_with_index do |pee_part, i|
26
+ start = pee_part.index('<pre')
27
+
28
+ unless start
29
+ pee += pee_part
30
+ next
31
+ end
32
+
33
+ name = "<pre wp-pre-tag-#{i}></pre>"
34
+ pre_tags[name] = pee_part[start..-1] + '</pre>'
35
+
36
+ pee += pee_part[0, start] + name
37
+ end
38
+ pee += last_pee
39
+ end
40
+
41
+ pee = pee.gsub(Regexp.new('<br />\s*<br />'), "\n\n")
42
+ pee = pee.gsub(Regexp.new("(<" + allblocks + "[^>]*>)"), "\n\\1")
43
+ pee = pee.gsub(Regexp.new("(</" + allblocks + ">)"), "\\1\n\n")
44
+ pee = pee.gsub("\r\n", "\n").gsub("\r", "\n")
45
+ if pee.include? '<object'
46
+ pee = pee.gsub(Regexp.new('\s*<param([^>]*)>\s*'), "<param\\1>")
47
+ pee = pee.gsub(Regexp.new('\s*</embed>\s*'), '</embed>')
48
+ end
49
+
50
+ pees = pee.split(/\n\s*\n/).compact
51
+ pee = ''
52
+ pees.each { |tinkle| pee += '<p>' + tinkle.chomp("\n") + "</p>\n" }
53
+ pee = pee.gsub(Regexp.new('<p>\s*</p>'), '')
54
+ pee = pee.gsub(Regexp.new('<p>([^<]+)</(div|address|form)>'), "<p>\\1</p></\\2>")
55
+ pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
56
+ pee = pee.gsub(Regexp.new('<p>(<li.+?)</p>'), "\\1")
57
+ pee = pee.gsub(Regexp.new('<p><blockquote([^>]*)>', 'i'), "<blockquote\\1><p>")
58
+ pee = pee.gsub('</blockquote></p>', '</p></blockquote>')
59
+ pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)'), "\\1")
60
+ pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
61
+ if br
62
+ pee = pee.gsub(Regexp.new('<(script|style).*?</\1>')) { |match| match.gsub("\n", "<WPPreserveNewline />") }
63
+ pee = pee.gsub(Regexp.new('(?<!<br />)\s*\n'), "<br />\n")
64
+ pee = pee.gsub('<WPPreserveNewline />', "\n")
65
+ end
66
+ pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*<br />'), "\\1")
67
+ pee = pee.gsub(Regexp.new('<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)'), "\\1")
68
+ pee = pee.gsub(Regexp.new('\n</p>$'), '</p>')
69
+
70
+ pre_tags.each do |name, value|
71
+ pee.gsub!(name, value)
72
+ end
73
+ pee
74
+ end
75
+ end
76
+ end