bunto-import 2.0.0 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +21 -21
  3. data/README.markdown +33 -33
  4. data/lib/bunto-import.rb +49 -49
  5. data/lib/bunto-import/importer.rb +26 -26
  6. data/lib/bunto-import/importers.rb +10 -10
  7. data/lib/bunto-import/importers/behance.rb +80 -80
  8. data/lib/bunto-import/importers/blogger.rb +330 -264
  9. data/lib/bunto-import/importers/csv.rb +96 -96
  10. data/lib/bunto-import/importers/drupal6.rb +53 -139
  11. data/lib/bunto-import/importers/drupal7.rb +54 -111
  12. data/lib/bunto-import/importers/drupal_common.rb +157 -0
  13. data/lib/bunto-import/importers/easyblog.rb +96 -96
  14. data/lib/bunto-import/importers/enki.rb +74 -74
  15. data/lib/bunto-import/importers/ghost.rb +68 -68
  16. data/lib/bunto-import/importers/google_reader.rb +64 -64
  17. data/lib/bunto-import/importers/joomla.rb +92 -90
  18. data/lib/bunto-import/importers/joomla3.rb +91 -91
  19. data/lib/bunto-import/importers/jrnl.rb +125 -125
  20. data/lib/bunto-import/importers/marley.rb +72 -72
  21. data/lib/bunto-import/importers/mephisto.rb +99 -99
  22. data/lib/bunto-import/importers/mt.rb +257 -257
  23. data/lib/bunto-import/importers/posterous.rb +130 -130
  24. data/lib/bunto-import/importers/rss.rb +62 -62
  25. data/lib/bunto-import/importers/s9y.rb +60 -60
  26. data/lib/bunto-import/importers/s9y_database.rb +363 -0
  27. data/lib/bunto-import/importers/textpattern.rb +70 -70
  28. data/lib/bunto-import/importers/tumblr.rb +300 -289
  29. data/lib/bunto-import/importers/typo.rb +88 -88
  30. data/lib/bunto-import/importers/wordpress.rb +372 -372
  31. data/lib/bunto-import/importers/wordpressdotcom.rb +207 -207
  32. data/lib/bunto-import/util.rb +76 -76
  33. data/lib/bunto-import/version.rb +3 -3
  34. data/lib/bunto/commands/import.rb +79 -79
  35. metadata +84 -54
@@ -1,207 +1,207 @@
1
- # encoding: UTF-8
2
-
3
- module BuntoImport
4
- module Importers
5
- class WordpressDotCom < Importer
6
- def self.require_deps
7
- BuntoImport.require_with_fallback(%w[
8
- rubygems
9
- fileutils
10
- safe_yaml
11
- hpricot
12
- time
13
- open-uri
14
- open_uri_redirections
15
- ])
16
- end
17
-
18
- def self.specify_options(c)
19
- c.option 'source', '--source FILE', 'WordPress export XML file (default: "wordpress.xml")'
20
- c.option 'no_fetch_images', '--no-fetch-images', 'Do not fetch the images referenced in the posts'
21
- c.option 'assets_folder', '--assets_folder FOLDER', 'Folder where assets such as images will be downloaded to (default: assets)'
22
- end
23
-
24
- # Will modify post DOM tree
25
- def self.download_images(title, post_hpricot, assets_folder)
26
- images = (post_hpricot/"img")
27
- if images.length == 0
28
- return
29
- end
30
- puts "Downloading images for " + title
31
- images.each do |i|
32
- uri = i["src"]
33
-
34
- i["src"] = "{{ site.baseurl }}/%s/%s" % [assets_folder, File.basename(uri)]
35
- dst = File.join(assets_folder, File.basename(uri))
36
- puts " " + uri
37
- if File.exist?(dst)
38
- puts " Already in cache. Clean assets folder if you want a redownload."
39
- next
40
- end
41
- begin
42
- open(uri, allow_redirections: :safe) {|f|
43
- File.open(dst, "wb") do |out|
44
- out.puts f.read
45
- end
46
- }
47
- puts " OK!"
48
- rescue => e
49
- puts " Error: #{e.message}"
50
- puts e.backtrace.join("\n")
51
- end
52
- end
53
- end
54
-
55
- class Item
56
- def initialize(node)
57
- @node = node
58
- end
59
-
60
- def text_for(path)
61
- @node.at(path).inner_text
62
- end
63
-
64
- def title
65
- @title ||= text_for(:title).strip
66
- end
67
-
68
- def permalink_title
69
- post_name = text_for('wp:post_name')
70
- # Fallback to "prettified" title if post_name is empty (can happen)
71
- @permalink_title ||= if post_name.empty?
72
- WordpressDotCom.sluggify(title)
73
- else
74
- post_name
75
- end
76
- end
77
-
78
- def published_at
79
- if published?
80
- @published_at ||= Time.parse(text_for('wp:post_date'))
81
- end
82
- end
83
-
84
- def status
85
- @status ||= text_for('wp:status')
86
- end
87
-
88
- def post_type
89
- @post_type ||= text_for('wp:post_type')
90
- end
91
-
92
- def file_name
93
- @file_name ||= if published?
94
- "#{published_at.strftime('%Y-%m-%d')}-#{permalink_title}.html"
95
- else
96
- "#{permalink_title}.html"
97
- end
98
- end
99
-
100
- def directory_name
101
- @directory_name ||= if !published? && post_type == 'post'
102
- '_drafts'
103
- else
104
- "_#{post_type}s"
105
- end
106
- end
107
-
108
- def published?
109
- @published ||= (status == 'publish')
110
- end
111
-
112
- def excerpt
113
- @excerpt ||= begin
114
- text = Hpricot(text_for('excerpt:encoded')).inner_text
115
- if text.empty?
116
- nil
117
- else
118
- text
119
- end
120
- end
121
- end
122
- end
123
-
124
- def self.process(options)
125
- source = options.fetch('source', "wordpress.xml")
126
- fetch = !options.fetch('no_fetch_images', false)
127
- assets_folder = options.fetch('assets_folder', 'assets')
128
- FileUtils.mkdir_p(assets_folder)
129
-
130
- import_count = Hash.new(0)
131
- doc = Hpricot::XML(File.read(source))
132
- # Fetch authors data from header
133
- authors = Hash[
134
- (doc/:channel/'wp:author').map do |author|
135
- [author.at("wp:author_login").inner_text.strip, {
136
- "login" => author.at("wp:author_login").inner_text.strip,
137
- "email" => author.at("wp:author_email").inner_text,
138
- "display_name" => author.at("wp:author_display_name").inner_text,
139
- "first_name" => author.at("wp:author_first_name").inner_text,
140
- "last_name" => author.at("wp:author_last_name").inner_text
141
- }]
142
- end
143
- ] rescue {}
144
-
145
- (doc/:channel/:item).each do |node|
146
- item = Item.new(node)
147
- categories = node.search('category[@domain="category"]').map(&:inner_text).reject{|c| c == 'Uncategorized'}.uniq
148
- tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
149
-
150
- metas = Hash.new
151
- node.search("wp:postmeta").each do |meta|
152
- key = meta.at('wp:meta_key').inner_text
153
- value = meta.at('wp:meta_value').inner_text
154
- metas[key] = value
155
- end
156
-
157
- author_login = item.text_for('dc:creator').strip
158
-
159
- header = {
160
- 'layout' => item.post_type,
161
- 'title' => item.title,
162
- 'date' => item.published_at,
163
- 'type' => item.post_type,
164
- 'published' => item.published?,
165
- 'status' => item.status,
166
- 'categories' => categories,
167
- 'tags' => tags,
168
- 'meta' => metas,
169
- 'author' => authors[author_login]
170
- }
171
-
172
- begin
173
- content = Hpricot(item.text_for('content:encoded'))
174
- header['excerpt'] = item.excerpt if item.excerpt
175
-
176
- if fetch
177
- download_images(item.title, content, assets_folder)
178
- end
179
-
180
- FileUtils.mkdir_p item.directory_name
181
- File.open(File.join(item.directory_name, item.file_name), "w") do |f|
182
- f.puts header.to_yaml
183
- f.puts '---'
184
- f.puts Util.wpautop(content.to_html)
185
- end
186
- rescue => e
187
- puts "Couldn't import post!"
188
- puts "Title: #{item.title}"
189
- puts "Name/Slug: #{item.file_name}\n"
190
- puts "Error: #{e.message}"
191
- next
192
- end
193
-
194
- import_count[item.post_type] += 1
195
- end
196
-
197
- import_count.each do |key, value|
198
- puts "Imported #{value} #{key}s"
199
- end
200
- end
201
-
202
- def self.sluggify(title)
203
- title.gsub(/[^[:alnum:]]+/, '-').downcase
204
- end
205
- end
206
- end
207
- end
1
+ # encoding: UTF-8
2
+
3
+ module BuntoImport
4
+ module Importers
5
+ class WordpressDotCom < Importer
6
+ def self.require_deps
7
+ BuntoImport.require_with_fallback(%w[
8
+ rubygems
9
+ fileutils
10
+ safe_yaml
11
+ hpricot
12
+ time
13
+ open-uri
14
+ open_uri_redirections
15
+ ])
16
+ end
17
+
18
+ def self.specify_options(c)
19
+ c.option 'source', '--source FILE', 'WordPress export XML file (default: "wordpress.xml")'
20
+ c.option 'no_fetch_images', '--no-fetch-images', 'Do not fetch the images referenced in the posts'
21
+ c.option 'assets_folder', '--assets_folder FOLDER', 'Folder where assets such as images will be downloaded to (default: assets)'
22
+ end
23
+
24
+ # Will modify post DOM tree
25
+ def self.download_images(title, post_hpricot, assets_folder)
26
+ images = (post_hpricot/"img")
27
+ if images.length == 0
28
+ return
29
+ end
30
+ puts "Downloading images for " + title
31
+ images.each do |i|
32
+ uri = i["src"]
33
+
34
+ i["src"] = "{{ site.baseurl }}/%s/%s" % [assets_folder, File.basename(uri)]
35
+ dst = File.join(assets_folder, File.basename(uri))
36
+ puts " " + uri
37
+ if File.exist?(dst)
38
+ puts " Already in cache. Clean assets folder if you want a redownload."
39
+ next
40
+ end
41
+ begin
42
+ open(uri, allow_redirections: :safe) {|f|
43
+ File.open(dst, "wb") do |out|
44
+ out.puts f.read
45
+ end
46
+ }
47
+ puts " OK!"
48
+ rescue => e
49
+ puts " Error: #{e.message}"
50
+ puts e.backtrace.join("\n")
51
+ end
52
+ end
53
+ end
54
+
55
+ class Item
56
+ def initialize(node)
57
+ @node = node
58
+ end
59
+
60
+ def text_for(path)
61
+ @node.at(path).inner_text
62
+ end
63
+
64
+ def title
65
+ @title ||= text_for(:title).strip
66
+ end
67
+
68
+ def permalink_title
69
+ post_name = text_for('wp:post_name')
70
+ # Fallback to "prettified" title if post_name is empty (can happen)
71
+ @permalink_title ||= if post_name.empty?
72
+ WordpressDotCom.sluggify(title)
73
+ else
74
+ post_name
75
+ end
76
+ end
77
+
78
+ def published_at
79
+ if published?
80
+ @published_at ||= Time.parse(text_for('wp:post_date'))
81
+ end
82
+ end
83
+
84
+ def status
85
+ @status ||= text_for('wp:status')
86
+ end
87
+
88
+ def post_type
89
+ @post_type ||= text_for('wp:post_type')
90
+ end
91
+
92
+ def file_name
93
+ @file_name ||= if published?
94
+ "#{published_at.strftime('%Y-%m-%d')}-#{permalink_title}.html"
95
+ else
96
+ "#{permalink_title}.html"
97
+ end
98
+ end
99
+
100
+ def directory_name
101
+ @directory_name ||= if !published? && post_type == 'post'
102
+ '_drafts'
103
+ else
104
+ "_#{post_type}s"
105
+ end
106
+ end
107
+
108
+ def published?
109
+ @published ||= (status == 'publish')
110
+ end
111
+
112
+ def excerpt
113
+ @excerpt ||= begin
114
+ text = Hpricot(text_for('excerpt:encoded')).inner_text
115
+ if text.empty?
116
+ nil
117
+ else
118
+ text
119
+ end
120
+ end
121
+ end
122
+ end
123
+
124
+ def self.process(options)
125
+ source = options.fetch('source', "wordpress.xml")
126
+ fetch = !options.fetch('no_fetch_images', false)
127
+ assets_folder = options.fetch('assets_folder', 'assets')
128
+ FileUtils.mkdir_p(assets_folder)
129
+
130
+ import_count = Hash.new(0)
131
+ doc = Hpricot::XML(File.read(source))
132
+ # Fetch authors data from header
133
+ authors = Hash[
134
+ (doc/:channel/'wp:author').map do |author|
135
+ [author.at("wp:author_login").inner_text.strip, {
136
+ "login" => author.at("wp:author_login").inner_text.strip,
137
+ "email" => author.at("wp:author_email").inner_text,
138
+ "display_name" => author.at("wp:author_display_name").inner_text,
139
+ "first_name" => author.at("wp:author_first_name").inner_text,
140
+ "last_name" => author.at("wp:author_last_name").inner_text
141
+ }]
142
+ end
143
+ ] rescue {}
144
+
145
+ (doc/:channel/:item).each do |node|
146
+ item = Item.new(node)
147
+ categories = node.search('category[@domain="category"]').map(&:inner_text).reject{|c| c == 'Uncategorized'}.uniq
148
+ tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
149
+
150
+ metas = Hash.new
151
+ node.search("wp:postmeta").each do |meta|
152
+ key = meta.at('wp:meta_key').inner_text
153
+ value = meta.at('wp:meta_value').inner_text
154
+ metas[key] = value
155
+ end
156
+
157
+ author_login = item.text_for('dc:creator').strip
158
+
159
+ header = {
160
+ 'layout' => item.post_type,
161
+ 'title' => item.title,
162
+ 'date' => item.published_at,
163
+ 'type' => item.post_type,
164
+ 'published' => item.published?,
165
+ 'status' => item.status,
166
+ 'categories' => categories,
167
+ 'tags' => tags,
168
+ 'meta' => metas,
169
+ 'author' => authors[author_login]
170
+ }
171
+
172
+ begin
173
+ content = Hpricot(item.text_for('content:encoded'))
174
+ header['excerpt'] = item.excerpt if item.excerpt
175
+
176
+ if fetch
177
+ download_images(item.title, content, assets_folder)
178
+ end
179
+
180
+ FileUtils.mkdir_p item.directory_name
181
+ File.open(File.join(item.directory_name, item.file_name), "w") do |f|
182
+ f.puts header.to_yaml
183
+ f.puts '---'
184
+ f.puts Util.wpautop(content.to_html)
185
+ end
186
+ rescue => e
187
+ puts "Couldn't import post!"
188
+ puts "Title: #{item.title}"
189
+ puts "Name/Slug: #{item.file_name}\n"
190
+ puts "Error: #{e.message}"
191
+ next
192
+ end
193
+
194
+ import_count[item.post_type] += 1
195
+ end
196
+
197
+ import_count.each do |key, value|
198
+ puts "Imported #{value} #{key}s"
199
+ end
200
+ end
201
+
202
+ def self.sluggify(title)
203
+ title.gsub(/[^[:alnum:]]+/, '-').downcase
204
+ end
205
+ end
206
+ end
207
+ end
@@ -1,76 +1,76 @@
1
- module BuntoImport
2
- module Util
3
-
4
- # Ruby translation of wordpress wpautop (see https://core.trac.wordpress.org/browser/trunk/src/wp-includes/formatting.php)
5
- #
6
- # A group of regex replaces used to identify text formatted with newlines and
7
- # replace double line-breaks with HTML paragraph tags. The remaining
8
- # line-breaks after conversion become <<br />> tags, unless $br is set to false
9
- #
10
- # @param string pee The text which has to be formatted.
11
- # @param bool br Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
12
- # @return string Text which has been converted into correct paragraph tags.
13
- #
14
- def self.wpautop(pee, br = true)
15
- return '' if pee.strip == ''
16
-
17
- allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|noscript|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
18
- pre_tags = {}
19
- pee = pee + "\n"
20
-
21
- if pee.include?('<pre')
22
- pee_parts = pee.split('</pre>')
23
- last_pee = pee_parts.pop
24
- pee = ''
25
- pee_parts.each_with_index do |pee_part, i|
26
- start = pee_part.index('<pre')
27
-
28
- unless start
29
- pee += pee_part
30
- next
31
- end
32
-
33
- name = "<pre wp-pre-tag-#{i}></pre>"
34
- pre_tags[name] = pee_part[start..-1] + '</pre>'
35
-
36
- pee += pee_part[0, start] + name
37
- end
38
- pee += last_pee
39
- end
40
-
41
- pee = pee.gsub(Regexp.new('<br />\s*<br />'), "\n\n")
42
- pee = pee.gsub(Regexp.new("(<" + allblocks + "[^>]*>)"), "\n\\1")
43
- pee = pee.gsub(Regexp.new("(</" + allblocks + ">)"), "\\1\n\n")
44
- pee = pee.gsub("\r\n", "\n").gsub("\r", "\n")
45
- if pee.include? '<object'
46
- pee = pee.gsub(Regexp.new('\s*<param([^>]*)>\s*'), "<param\\1>")
47
- pee = pee.gsub(Regexp.new('\s*</embed>\s*'), '</embed>')
48
- end
49
-
50
- pees = pee.split(/\n\s*\n/).compact
51
- pee = ''
52
- pees.each { |tinkle| pee += '<p>' + tinkle.chomp("\n") + "</p>\n" }
53
- pee = pee.gsub(Regexp.new('<p>\s*</p>'), '')
54
- pee = pee.gsub(Regexp.new('<p>([^<]+)</(div|address|form)>'), "<p>\\1</p></\\2>")
55
- pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
56
- pee = pee.gsub(Regexp.new('<p>(<li.+?)</p>'), "\\1")
57
- pee = pee.gsub(Regexp.new('<p><blockquote([^>]*)>', 'i'), "<blockquote\\1><p>")
58
- pee = pee.gsub('</blockquote></p>', '</p></blockquote>')
59
- pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)'), "\\1")
60
- pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
61
- if br
62
- pee = pee.gsub(Regexp.new('<(script|style).*?</\1>')) { |match| match.gsub("\n", "<WPPreserveNewline />") }
63
- pee = pee.gsub(Regexp.new('(?<!<br />)\s*\n'), "<br />\n")
64
- pee = pee.gsub('<WPPreserveNewline />', "\n")
65
- end
66
- pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*<br />'), "\\1")
67
- pee = pee.gsub(Regexp.new('<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)'), "\\1")
68
- pee = pee.gsub(Regexp.new('\n</p>$'), '</p>')
69
-
70
- pre_tags.each do |name, value|
71
- pee.gsub!(name, value)
72
- end
73
- pee
74
- end
75
- end
76
- end
1
+ module BuntoImport
2
+ module Util
3
+
4
+ # Ruby translation of wordpress wpautop (see https://core.trac.wordpress.org/browser/trunk/src/wp-includes/formatting.php)
5
+ #
6
+ # A group of regex replaces used to identify text formatted with newlines and
7
+ # replace double line-breaks with HTML paragraph tags. The remaining
8
+ # line-breaks after conversion become <<br />> tags, unless $br is set to false
9
+ #
10
+ # @param string pee The text which has to be formatted.
11
+ # @param bool br Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
12
+ # @return string Text which has been converted into correct paragraph tags.
13
+ #
14
+ def self.wpautop(pee, br = true)
15
+ return '' if pee.strip == ''
16
+
17
+ allblocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|noscript|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
18
+ pre_tags = {}
19
+ pee = pee + "\n"
20
+
21
+ if pee.include?('<pre')
22
+ pee_parts = pee.split('</pre>')
23
+ last_pee = pee_parts.pop
24
+ pee = ''
25
+ pee_parts.each_with_index do |pee_part, i|
26
+ start = pee_part.index('<pre')
27
+
28
+ unless start
29
+ pee += pee_part
30
+ next
31
+ end
32
+
33
+ name = "<pre wp-pre-tag-#{i}></pre>"
34
+ pre_tags[name] = pee_part[start..-1] + '</pre>'
35
+
36
+ pee += pee_part[0, start] + name
37
+ end
38
+ pee += last_pee
39
+ end
40
+
41
+ pee = pee.gsub(Regexp.new('<br />\s*<br />'), "\n\n")
42
+ pee = pee.gsub(Regexp.new("(<" + allblocks + "[^>]*>)"), "\n\\1")
43
+ pee = pee.gsub(Regexp.new("(</" + allblocks + ">)"), "\\1\n\n")
44
+ pee = pee.gsub("\r\n", "\n").gsub("\r", "\n")
45
+ if pee.include? '<object'
46
+ pee = pee.gsub(Regexp.new('\s*<param([^>]*)>\s*'), "<param\\1>")
47
+ pee = pee.gsub(Regexp.new('\s*</embed>\s*'), '</embed>')
48
+ end
49
+
50
+ pees = pee.split(/\n\s*\n/).compact
51
+ pee = ''
52
+ pees.each { |tinkle| pee += '<p>' + tinkle.chomp("\n") + "</p>\n" }
53
+ pee = pee.gsub(Regexp.new('<p>\s*</p>'), '')
54
+ pee = pee.gsub(Regexp.new('<p>([^<]+)</(div|address|form)>'), "<p>\\1</p></\\2>")
55
+ pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
56
+ pee = pee.gsub(Regexp.new('<p>(<li.+?)</p>'), "\\1")
57
+ pee = pee.gsub(Regexp.new('<p><blockquote([^>]*)>', 'i'), "<blockquote\\1><p>")
58
+ pee = pee.gsub('</blockquote></p>', '</p></blockquote>')
59
+ pee = pee.gsub(Regexp.new('<p>\s*(</?' + allblocks + '[^>]*>)'), "\\1")
60
+ pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*</p>'), "\\1")
61
+ if br
62
+ pee = pee.gsub(Regexp.new('<(script|style).*?</\1>')) { |match| match.gsub("\n", "<WPPreserveNewline />") }
63
+ pee = pee.gsub(Regexp.new('(?<!<br />)\s*\n'), "<br />\n")
64
+ pee = pee.gsub('<WPPreserveNewline />', "\n")
65
+ end
66
+ pee = pee.gsub(Regexp.new('(</?' + allblocks + '[^>]*>)\s*<br />'), "\\1")
67
+ pee = pee.gsub(Regexp.new('<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)'), "\\1")
68
+ pee = pee.gsub(Regexp.new('\n</p>$'), '</p>')
69
+
70
+ pre_tags.each do |name, value|
71
+ pee.gsub!(name, value)
72
+ end
73
+ pee
74
+ end
75
+ end
76
+ end