qipowl 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +7 -0
  2. data/.document +11 -0
  3. data/.gitignore +17 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +3 -0
  6. data/.yardopts +3 -0
  7. data/Gemfile +17 -0
  8. data/LICENSE +20 -0
  9. data/README.md +345 -0
  10. data/Rakefile +21 -0
  11. data/bin/bowler +44 -0
  12. data/config/bowlers/cmd.yaml +3 -0
  13. data/config/bowlers/html.yaml +128 -0
  14. data/config/bowlers/html_supplemental.yaml +3 -0
  15. data/config/bowlers/markdown2html.yaml +23 -0
  16. data/extras/demo/main.rb +34 -0
  17. data/extras/demo/public/apple-touch-icon-114x114-precomposed.png +0 -0
  18. data/extras/demo/public/apple-touch-icon-144x144-precomposed.png +0 -0
  19. data/extras/demo/public/apple-touch-icon-57x57-precomposed.png +0 -0
  20. data/extras/demo/public/apple-touch-icon-72x72-precomposed.png +0 -0
  21. data/extras/demo/public/apple-touch-icon-precomposed.png +0 -0
  22. data/extras/demo/public/apple-touch-icon.png +0 -0
  23. data/extras/demo/public/css/bootstrap-theme.css +384 -0
  24. data/extras/demo/public/css/bootstrap-theme.min.css +1 -0
  25. data/extras/demo/public/css/bootstrap.css +6805 -0
  26. data/extras/demo/public/css/bootstrap.min.css +9 -0
  27. data/extras/demo/public/css/main.css +22 -0
  28. data/extras/demo/public/favicon.ico +0 -0
  29. data/extras/demo/public/fonts/glyphicons-halflings-regular.eot +0 -0
  30. data/extras/demo/public/fonts/glyphicons-halflings-regular.svg +228 -0
  31. data/extras/demo/public/fonts/glyphicons-halflings-regular.ttf +0 -0
  32. data/extras/demo/public/fonts/glyphicons-halflings-regular.woff +0 -0
  33. data/extras/demo/public/html.html +262 -0
  34. data/extras/demo/public/index.html +110 -0
  35. data/extras/demo/public/js/main.js +1 -0
  36. data/extras/demo/public/js/vendor/bootstrap.js +1999 -0
  37. data/extras/demo/public/js/vendor/bootstrap.min.js +6 -0
  38. data/extras/demo/public/js/vendor/jquery-1.10.1.min.js +6 -0
  39. data/extras/demo/public/js/vendor/modernizr-2.6.2-respond-1.1.0.min.js +11 -0
  40. data/extras/drafts/parsing.md +137 -0
  41. data/extras/support/typo +66 -0
  42. data/features/bowler.feature +8 -0
  43. data/features/html.feature +229 -0
  44. data/features/step_definitions/bowler_steps.rb +39 -0
  45. data/features/step_definitions/html_steps.rb +11 -0
  46. data/features/support/env.rb +7 -0
  47. data/images/owl-old.png +0 -0
  48. data/images/owl-old.xcf +0 -0
  49. data/images/owl.png +0 -0
  50. data/images/owl.xcf +0 -0
  51. data/lib/qipowl/bowlers/cmd.rb +26 -0
  52. data/lib/qipowl/bowlers/html.rb +409 -0
  53. data/lib/qipowl/bowlers/htmldoc.rb +268 -0
  54. data/lib/qipowl/bowlers/yaml.rb +63 -0
  55. data/lib/qipowl/core/bowler.rb +251 -0
  56. data/lib/qipowl/core/mapper.rb +92 -0
  57. data/lib/qipowl/core/monkeypatches.rb +168 -0
  58. data/lib/qipowl/core/ruler.rb +106 -0
  59. data/lib/qipowl/utils/hash_recursive_merge.rb +72 -0
  60. data/lib/qipowl/utils/logging.rb +14 -0
  61. data/lib/qipowl/version.rb +3 -0
  62. data/lib/qipowl.rb +50 -0
  63. data/qipowl.gemspec +42 -0
  64. data/qipowl.komodoproject +4 -0
  65. data/spec/bowler_spec.rb +11 -0
  66. data/spec/spec_helper.rb +15 -0
  67. data/spec/string_spec.rb +32 -0
  68. data/spec/yaml_test.yaml +10 -0
  69. metadata +254 -0
@@ -0,0 +1,409 @@
1
+ # encoding: utf-8
2
+
3
+ require 'net/http'
4
+ require 'htmlbeautifier'
5
+
6
+ require_relative '../core/bowler'
7
+ require_relative '../bowlers/htmldoc'
8
+
9
+ module Qipowl
10
+ # Module placeholder for dynamically created bowlers
11
+ module Bowlers
12
+ class Html < Bowler
13
+ ##############################################################################
14
+ ### Default handlers for all the types of markup ###
15
+ ##############################################################################
16
+
17
+ # `:grip` default handler
18
+ # @param [Array] args the words, gained since last call to {#harvest}
19
+ # @return [Array] the array of words with trimmed `grip` tag
20
+ def ∀_grip *args
21
+ text = [*args].join(SEPARATOR)
22
+ mine, rest = text.split("#{__callee__}∎", 2)
23
+ [tagify(∃_grip_tag(__callee__), {:class => ∃_grip(__callee__)[:class]}, mine), rest]
24
+ end
25
+
26
+ # `:alone` default handler
27
+ # @param [Array] args the words, gained since last call to {#harvest}
28
+ # @return [Array] the array of words with prepended `alone` tag
29
+ def ∀_alone *args
30
+ [standalone(∃_alone_tag(__callee__), {:class => ∃_alone(__callee__)[:class]}), args]
31
+ end
32
+
33
+ # `:block` default handler
34
+ # @param [Array] args the words, gained since last call to {#harvest}
35
+ # @param [String] param the text to be placed on the same line as
36
+ # opening tag
37
+ # @return [Nil] nil
38
+ def ∀_block param, args
39
+ harvest __callee__,
40
+ tagify(
41
+ ∃_block_tag(__callee__),
42
+ {:class => (param.strip.empty? ? ∃_block(__callee__)[:class] : param.strip)},
43
+ args.hsub(String::HTML_ENTITIES)
44
+ )
45
+ end
46
+
47
+ # `:magnet` default handler
48
+ # @param [Array] args the words, gained since last call to {#harvest}
49
+ # @return [Array] the array of words with trimmed `magnet` tag
50
+ def ∀_magnet *args
51
+ param, *rest = args.flatten
52
+ param = param.unbowl.to_s.prepend("#{__callee__}#{String::NBSP}")
53
+ [tagify(∃_magnet_tag(__callee__), {:class => ∃_magnet(__callee__)[:class]}, param), rest]
54
+ end
55
+
56
+ # `:regular` default handler
57
+ # @param [Array] args the words, gained since last call to {#harvest}
58
+ def ∀_regular *args
59
+ harvest __callee__,
60
+ tagify(
61
+ ∃_regular_tag(canonize(__callee__)),
62
+ {:class => ∃_regular(canonize(__callee__))[:class]},
63
+ args
64
+ )
65
+ end
66
+
67
+ ##############################################################################
68
+ ### Grip :: Specific handlers ###
69
+ ##############################################################################
70
+ # Handler for abbrs.
71
+ # @param [Array] args the words, gained since last call to {#harvest}
72
+ # @return [Array] the array of words with trimmed `abbr` tag
73
+ def † *args
74
+ term, *title = args.flatten
75
+ mine, rest = [*title].join(SEPARATOR).split("#{__callee__}∎", 2)
76
+ [tagify(∃_grip_tag(__callee__), {:title => mine, :class => ∃_grip(__callee__)[:class]}, term), rest]
77
+ end
78
+
79
+ # Handler for anchors.
80
+ # @param [Array] args the words, gained since last call to {#harvest}
81
+ # @return [Array] the array of words with trimmed `a` tag
82
+ def ⚓ *args
83
+ href, *title = args.flatten
84
+ mine, rest = [*title].join(SEPARATOR).split("#{__callee__}∎", 2)
85
+ href = href.unbowl
86
+ [
87
+ case get_href_content(href)
88
+ when :img
89
+ standalone :img, { :src => href, :alt => [*mine].join(SEPARATOR), :class => 'inplace' }
90
+ else
91
+ tagify ∃_grip_tag(__callee__), {:href => href}, mine
92
+ end, rest
93
+ ]
94
+ end
95
+
96
+ ##############################################################################
97
+ ### Alone :: Specific handlers ###
98
+ ##############################################################################
99
+ # `:alone` handler for horizontal rule; it differs from default
100
+ # handler since orphans around must be handled as well.
101
+ # @param [Array] args the words, gained since last call to {#harvest}
102
+ # @return [Nil] nil
103
+ def —— *args
104
+ harvest nil, orphan(args.join(SEPARATOR)) unless args.vacant?
105
+ harvest __callee__, standalone(∃_alone_tag(__callee__))
106
+ end
107
+
108
+ ##############################################################################
109
+ ### Block :: Specific handlers ###
110
+ ##############################################################################
111
+ # `:block` handler for comment (required because comments are
112
+ # formatted in HTML in some specific way.)
113
+ # @param [String] param the text to be placed on the same line as opening tag
114
+ # @param [Array] args the words, gained since last call to {#harvest}
115
+ # @return [Nil] nil
116
+ def ✍ *args
117
+ []
118
+ end
119
+
120
+ ##############################################################################
121
+ ### Magnet :: Specific handlers ###
122
+ ##############################################################################
123
+ # `:magnet` handler for reference to Livejournal user.
124
+ # @param [String] param the text to be placed on the same line as opening tag
125
+ # @param [Array] args the words, gained since last call to {#harvest}
126
+ # @return [Nil] nil
127
+ def ✎ *args
128
+ param, *rest = args.flatten
129
+ param = param.unbowl
130
+ ljref = "<span style='white-space: nowrap;'><a href='http://#{param}.livejournal.com/profile?mode=full'><img src='http://l-stat.livejournal.com/img/userinfo.gif' alt='[info]' style='border: 0pt none ; vertical-align: bottom; padding-right: 1px;' height='17' width='17'></a><a href='http://#{param}.livejournal.com/?style=mine'><b>#{param}</b></a></span>"
131
+ [ljref, rest]
132
+ end
133
+
134
+ def ☇ *args
135
+ param, *rest = args.flatten
136
+ [tagify(∃_magnet_tag(__callee__), {:name => param.unbowl}, String::ZERO_WIDTH_SPACE), rest]
137
+ end
138
+
139
+ ##############################################################################
140
+ ### Regular :: Specific handlers ###
141
+ ##############################################################################
142
+ # Handler for Youtube video
143
+ # @param [Array] args the words, gained since last call to {#harvest}
144
+ # @return [Nil] nil
145
+ def ✇ *args
146
+ id, *rest = args.flatten
147
+ harvest nil, orphan(rest.join(SEPARATOR)) unless rest.vacant?
148
+ harvest __callee__, %Q(
149
+ <iframe class='youtube' width='560' height='315' src='http://www.youtube.com/embed/#{id.unbowl}'
150
+ frameborder='0' allowfullscreen></iframe>
151
+ )
152
+ end
153
+
154
+ # Handler for standalone pictures and their captions
155
+ # @todo Make it to understand quotes when there is a plain HTML on the other side
156
+ #
157
+ # @param [Array] args the words, gained since last call to {#harvest}
158
+ # @return [Nil] nil
159
+ def ⚘ *args
160
+ href, *title = args.flatten
161
+ harvest __callee__, %Q(
162
+ <figure>
163
+ <img src='#{href.unbowl}'/>
164
+ <figcaption>
165
+ <p>
166
+ #{title.join(SEPARATOR)}
167
+ </p>
168
+ </figcaption>
169
+ </figure>
170
+ )
171
+ end
172
+
173
+ # `:regular` handler for data lists (required since data list items
174
+ # consist of two tags: `dt` and `dd`.)
175
+ # @param [Array] args the words, gained since last call to {#harvest}
176
+ # @return [Nil] nil
177
+ def ▶ *args
178
+ dt, dd = args.join(SEPARATOR).split(/\s+(?:—)\s+/)
179
+ harvest __callee__, %Q(
180
+ #{tagify :dt, {}, dt}
181
+ #{tagify :dd, {}, dd}
182
+ )
183
+ end
184
+ # Alias for {#▶}, according to YAML rules specifies additional
185
+ # class for the data list `<dl>` tag behind (`dl-horizontal`.)
186
+ alias_method :▷, :▶
187
+
188
+ protected
189
+ # Computes the level of the `:linewide` element by counting
190
+ # preceding non-breakable spaces. For instance, nested lists
191
+ # are produced by appending `"\u{00A0}"` to the line item
192
+ # DSL tag:
193
+ #
194
+ # li = "• li1 \u{00A0}• nested 1 \u{00A0}• nested 2 • li2"
195
+ #
196
+ # @param [Symbol|String] callee the DSL symbol to get the level information for.
197
+ # @return [Integer] the level requested.
198
+ #
199
+ def level callee
200
+ (callee = callee.to_s).gsub(/#{String::NBSP}/, '').empty? ?
201
+ -1 : (0..callee.length-1).each { |i| break i if callee[i] != String::NBSP }
202
+ end
203
+
204
+ def canonize callee
205
+ callee.to_s.gsub(/^#{String::NBSP}*/, '').to_sym if callee
206
+ end
207
+
208
+ # @see Qipowl::Bowler#harvest
209
+ #
210
+ # Additionally it checks if there was a `:linewide` item, requiring
211
+ # surrounding html element (like `<ul>` around several `<li>`s.)
212
+ #
213
+ # @param [Symbol] callee of method
214
+ # @param [String] str to be harvested
215
+ def harvest callee, str
216
+ if callee.nil? || callee != @callee
217
+ level(callee).downto(level(@callee) + 1) { |i|
218
+ str += i.␚ify
219
+ } unless ∃_enclosures(canonize(callee)).nil?
220
+
221
+ if prev = ∃_enclosures(canonize(@callee))
222
+ level(@callee).downto(level(callee) + 1) { |i|
223
+ @yielded.last.sub!(/\A/, opening(prev[:tag], {:class => prev[:class]}))
224
+ @yielded.each { |s| s.gsub!(/#{i.␚ify}/) { closing(prev[:tag]) } }
225
+ }
226
+ end
227
+
228
+ @callee = callee
229
+ end
230
+ super callee, str
231
+ end
232
+
233
+ private
234
+ # Hence we cannot simply declare the DSL for it, we need to handle
235
+ # calls to all the _methods_, starting with those symbols.
236
+ #
237
+ # @param [Symbol] method as specified by caller (`method_missing`.)
238
+ # @param [Array] args as specified by caller (`method_missing`.)
239
+ # @param [Proc] block as specified by caller (`method_missing`.)
240
+ #
241
+ # @return [Array] the array of words
242
+ def special_handler method, *args, &block
243
+ # Sublevel markers, e.g. “ •” is level 2 line-item
244
+ return [method, args].flatten \
245
+ unless level(method) > 0 && self.class::REGULAR_TAGS.keys.include?(canonize(method))
246
+
247
+ self.class.class_eval "alias_method :#{method}, :#{canonize(method)}"
248
+ send method, args, block
249
+ end
250
+
251
+ # Produces html paragraph tag (`<p>`) with class `owl`.
252
+ # @see Qipowl::Bowler#orphan
253
+ # @param str the words, to be put in paragraph tag.
254
+ # @return [String] tagged words.
255
+ def orphan str
256
+ "#{tagify(:p, {}, str.to_s.strip)}"
257
+ end
258
+ # Constructs opening html tag for the input given.
259
+ #
260
+ # To construct `abbr` tag with `title` _Title_ and class _default_:
261
+ #
262
+ # opening :abbr, { :title=>'Title', :class=>'default' }
263
+ #
264
+ # @param [String] tag to produce opening tag string from.
265
+ # @param [Hash] params to be put into opening tag as attributes.
266
+ # @return [String] opening tag for the input given.
267
+ def opening tag, params={}
268
+ attrs = params.inject("") { |m, el| m.prepend " #{el.first}='#{el.last}'" unless el.last.nil? ; m }
269
+ "<#{tag}#{attrs}>"
270
+ end
271
+
272
+ # Constructs closing html tag for the input given.
273
+ #
274
+ # @param [String] tag to produce closing tag string from.
275
+ # @return [String] closing tag for the input given.
276
+ def closing tag
277
+ "</#{tag}>"
278
+ end
279
+
280
+ # (see opening)
281
+ # Acts most like an {#opening} method, but closes an element inplace
282
+ # (used for `hr`, `br`, `img`).
283
+ def standalone tag, params={}
284
+ opening(tag, params).sub('>', '/>')
285
+ end
286
+ # Constructs valid tag for the input given, concatenating
287
+ # opening and closing tags around the text passed in `args`.
288
+ #
289
+ # @param [String] tag to produce html tag string from.
290
+ # @param [Hash] params to be put into opening tag as attributes.
291
+ # @param [Array] args the words, to be tagged around.
292
+ # @return [String] the text wrapped in opening and closing tags, or an empty string when the text is vacant.
293
+ def tagify tag, params, *args
294
+ text = [*args].join(SEPARATOR)
295
+ text.vacant? ? '' : "#{opening tag, params}#{text}#{closing tag}"
296
+ end
297
+
298
+
299
+ # Determines content of remote link by href.
300
+ # TODO Make image patterns configurable.
301
+ # @param [String] href link to remote resource
302
+ # @return [Symbol] content type (`:img` or `:text` currently)
303
+ def get_href_content href
304
+ href = href.to_s.unbowl.strip
305
+ if href.end_with?(* %w{png jpg jpeg gif PNG JPG JPEG GIF})
306
+ :img
307
+ elsif /\/\/i\.chzbgr/ =~ href
308
+ :img
309
+ else
310
+ :text
311
+ end
312
+
313
+ # uri = URI(href.to_s.unbowl)
314
+ # Net::HTTP.start(uri.host, uri.port) do |http|
315
+ # http.open_timeout = 1
316
+ # http.read_timeout = 1
317
+ #
318
+ # request = Net::HTTP::Head.new uri
319
+ # response = http.request request
320
+ # case response.to_hash["content-type"].first
321
+ # when /image/ then return :img
322
+ # when /text/ then return :text
323
+ # end
324
+ # end
325
+ # :unknown
326
+ #rescue
327
+ # logger.warn "Unable to determine link [#{href.to_s.unbowl}] type: no internet connection. Reverting to default."
328
+ # :unknown
329
+ end
330
+
331
+ end
332
+ end
333
+ end
334
+ =begin
335
+ # Markup processor for Html output.
336
+ #
337
+ # This class produces HTML from markup as Markdown does.
338
+
339
+ # Amount of unnamed instances of the class (needed for new class name generation)
340
+ @@inst_count = 0
341
+
342
+
343
+ # `:handshake` default handler
344
+ # @param [String] from packed as string operand “before”
345
+ # @param [String] till packed as string operand “after”
346
+ # @return
347
+ def ∈ *args
348
+ from, till, *rest = args.flatten
349
+ tag = @mapping.handshake(__callee__)
350
+ tag = tag[:tag] if Hash === tag
351
+ [tagify(tag, {}, "#{from.unbowl}#{__callee__}#{till.unbowl}".gsub(String::SYMBOL_FOR_SPACE, ' ')), rest]
352
+ end
353
+ alias_method :⊂, :∈
354
+
355
+
356
+
357
+ # @see {Qipowl::Bowler#defreeze}
358
+ #
359
+ # Additionally it checks if tag is a `:block` tag and
360
+ # substitutes all the carriage returns (`$/`) with special symbol
361
+ # {String::CARRIAGE_RETURN} to prevent format damage.
362
+ #
363
+ # @param [String] str to be defreezed
364
+ def defreeze str
365
+ str = super str
366
+ @mapping[:block].each { |tag, htmltag|
367
+ str.gsub!(/(#{tag})(.*?)$(.*?)(#{tag}|\Z)/m) { |m|
368
+ "#{$1}('#{$2}', '#{$3}')\n\n"
369
+ }
370
+ }
371
+ str
372
+ end
373
+
374
+ # @see {Qipowl::Bowler#serveup}
375
+ #
376
+ # Additionally it beautifies the output HTML
377
+ #
378
+ # @param [String] str to be roasted
379
+ def serveup str
380
+ result = ''
381
+ %w(. , : ; ! ? »).map(&:bowl).each { |punct|
382
+ str.gsub!(/(?:\p{Space}|#{String::CARRIAGE_RETURN})*(#{punct})/, '\1')
383
+ # str.gsub!(/(#{punct})(?=\p{Alnum})/, '\1 ')
384
+ }
385
+ %w(«).map(&:bowl).each { |punct|
386
+ str.gsub!(/(#{punct})(?:\p{Space}|#{String::CARRIAGE_RETURN})*/, '\1')
387
+ str.gsub!(/(?<=\p{Alnum})(#{punct})/, ' \1')
388
+ }
389
+ served = super(str)
390
+ begin
391
+ HtmlBeautifier::Beautifier.new(result).scan(served)
392
+ rescue
393
+ logger.error "Was unable to tidyfy resulting HTML. Returning as is."
394
+ result = served
395
+ end
396
+ result
397
+ end
398
+
399
+ end
400
+
401
+ if __FILE__ == $0
402
+
403
+ i = 0
404
+ Dir.glob("#{File.dirname(__FILE__)}/../../../data/octopress-site/source/_posts/**/*.owl").each {|f|
405
+ puts "Processing ##{i += 1}: #{f}"
406
+ Qipowl::Html.parse File.read(f)
407
+ }
408
+ end
409
+ =end
@@ -0,0 +1,268 @@
1
+ # encoding: utf-8
2
+
3
+ require 'nokogiri'
4
+ require 'fileutils'
5
+ require 'yaml'
6
+ require_relative '../core/monkeypatches.rb'
7
+ require_relative '../utils/hash_recursive_merge.rb'
8
+
9
+ module Qipowl
10
+
11
+ class HtmlDoc < Nokogiri::XML::SAX::Document
12
+ attr_reader :qp, :tags
13
+ def initialize mapping
14
+ @mapping = mapping
15
+ @inside = nil
16
+ @collector = {}
17
+ @tags = {:inplace => {}, :linewide => {}}
18
+ @qp = ''
19
+ @level = 0
20
+ end
21
+
22
+ def start_element name, attributes = []
23
+ current_attrs = Hash[attributes]
24
+
25
+ @qp += case name.to_sym
26
+ when :p, :div
27
+ if current_attrs['class']
28
+ @collector[name.to_sym] = "✿_#{name.to_sym}_#{current_attrs['class'].gsub(/\s+/, '_')}".to_sym
29
+ @tags[:linewide][@collector[name.to_sym]] = "#{name.to_sym}†#{current_attrs['class'].gsub(/\s+/, '†')}".to_sym
30
+ "\n\n#{@collector[name.to_sym]} "
31
+ else
32
+ "\n\n"
33
+ end
34
+ when :ul, :ol, :table, :dl
35
+ @inside = name.to_sym
36
+ "\n"
37
+ when :pre then "\n\nΛ\n"
38
+ when :tr then " ┇ "
39
+ when :td then " ┆ "
40
+ when :a
41
+ @inside = :a
42
+ @collector[:href] = current_attrs['href']
43
+ @collector[:name] = current_attrs['name']
44
+ ''
45
+ when :li then (@inside == :ol) ? "◦ " : "• "
46
+ when :b, :strong then "≡"
47
+ when :i, :em, :nobr then "≈"
48
+ when :strike, :del, :s then "─"
49
+ when :small then "↓"
50
+ when :u then "▁"
51
+ when :code, :tt then "λ"
52
+ when :dfn, :abbr, :cite
53
+ @inside = name.to_sym
54
+ @collector[:title] = current_attrs['title']
55
+ when :hr then "\n\n——\n\n"
56
+ when :br then " ⏎\n"
57
+ when :center then "\n— "
58
+ when :dt then "▷ "
59
+ when :dd then " — "
60
+ when :h1 then "§1 "
61
+ when :h2 then "§2 "
62
+ when :h3 then "§3 "
63
+ when :h4 then "§4 "
64
+ when :h5 then "§5 "
65
+ when :h6 then "§6 "
66
+ when :blockquote then "\n\n〉 "
67
+ when :figure
68
+ @inside = :figure
69
+ "\n\n"
70
+ when :figcaption then " "
71
+ when :img then fix_href(current_attrs['src'])
72
+ when :span, :sup
73
+ if current_attrs['class'].nil?
74
+ ''
75
+ else
76
+ @collector[name.to_sym] = "✿_span_#{current_attrs['class'].gsub(/\s+/, '_')}".to_sym
77
+ @tags[:inplace][@collector[name.to_sym]] = "span†#{current_attrs['class'].gsub(/\s+/, '†')}".to_sym
78
+ " #{@collector[name.to_sym]}"
79
+ end
80
+ when :embed, :iframe then "\n\n#{current_attrs['src']}\n\n"
81
+ when :html, :body, :object, :param, :thead, :tbody, :font, :'lj-embed', :'lj-cut'
82
+ ''
83
+ else
84
+ raise "=== Unhandled: #{name} with attrs: [#{current_attrs}]"
85
+ ''
86
+ end
87
+ end
88
+
89
+ def characters str
90
+ case @inside
91
+ when :a, :dfn, :abbr
92
+ @collector[:text] = str
93
+ else
94
+ @qp += str
95
+ end
96
+ end
97
+
98
+ def end_element name
99
+ @qp += case name.to_sym
100
+ when :p, :div
101
+ @collector.delete(name.to_sym)
102
+ "\n\n"
103
+ when :a
104
+ @inside = nil
105
+ (href= @collector.delete(:href)) ?
106
+ " #{(@collector.delete(:text) || '').gsub(/\s+/, "\u{00A0}")}¹#{fix_href href} " :
107
+ "☇ #{@collector.delete(:name)} #{@collector.delete(:text)}"
108
+ when :dfn, :abbr, :cite
109
+ @inside = nil
110
+ result = " #{@collector.delete(:text).gsub(/\s+/, "\u{00A0}")}†#{@collector.delete(:title)}† " rescue ''
111
+ result
112
+ when :ul, :ol, :table, :dl
113
+ @inside = nil
114
+ "\n"
115
+ when :li then "\n"
116
+ when :pre then "\nΛ\n\n"
117
+ when :b, :strong then "≡"
118
+ when :i, :em, :nobr then "≈"
119
+ when :u then "▁"
120
+ when :dd then "\n"
121
+ when :strike, :del, :s then "─"
122
+ when :small then "↓"
123
+ when :code, :tt then "λ"
124
+ when :span, :sup
125
+ "#{@collector.delete(name.to_sym)} "
126
+ when :h1, :h2, :h3, :h4, :h5, :h6 then "\n\n"
127
+ when :blockquote then "\n\n"
128
+ when :figure
129
+ @inside = nil
130
+ "\n\n"
131
+ else
132
+ ''
133
+ end
134
+ end
135
+
136
+ private
137
+ def fix_href href, site = 'http://mudasobwa.ru/'
138
+ href.start_with?('http') ? href : href.gsub(/\A\/+/, '').prepend(site)
139
+ end
140
+ end
141
+
142
+ end
143
+
144
+ if __FILE__ == $0
145
+
146
+ def prepare str
147
+ str.gsub(/&[nm]dash;/, '—') # dashes
148
+ .gsub(/&nbsp;/, ' ') # dashes
149
+ .gsub(/\s+--\s+/, ' — ') # dashes
150
+ .gsub(/^\s*/, '') # leading spaces
151
+ .gsub(/<img src="\/i\/>/, '')
152
+ .gsub(/&trade;/, '™') # other entities
153
+ .gsub(/&copy;/, '©') # other entities
154
+ .gsub(/(1st@1stone.ru|am@secondiary.ru)/, 'am@mudasobwa.ru')
155
+ .gsub(/http:\/\/(www\.)?(secondiary|1stone|matiouchkine.net)\.ru/, 'http://mudasobwa.ru') # obsolete site name
156
+ .gsub(/\[(http[^\]]*)\]/, '\1') # obsolete markdown pics
157
+ .gsub(/<span>\s*<\/span>/, 'λ\1λ') # obsolete markdown pics
158
+ .gsub(/<lj (?:comm|user)="(.*?)">/, '✎ \1') # obsolete markdown pics
159
+ .gsub(/<([^<>]*?@[^<>]*?)>/, '\1') # obsolete markdown pics
160
+ .gsub(/<imgsrc=/, '<img src=') # obsolete markdown pics
161
+ .gsub(/<ahref=/, '<a href=') # obsolete markdown pics
162
+ .gsub(/<\/p>\s*<p>\s*—/, " ⏎\n—") # direct speech
163
+ .gsub(/<br(?:\s*\/?\s*)>\s*<br(?:\s*\/?\s*)>/, "\n\n") # old-fashioned carriage
164
+ .gsub(/<[!]--[^<>]*?-->/, '') # comments
165
+ # .gsub(/([\.,:;!?])(?=\S)/, '\1 ') # fix punctuation
166
+ end
167
+
168
+ def postpare str
169
+ str.gsub(/\R{2,}/, "\n\n")
170
+ .gsub(/\A(\s|⏎)*/, '')
171
+ .gsub(/(\s|⏎)*\Z/, '')
172
+ end
173
+
174
+ tags = {
175
+ :magnet => {:✎ => :lj, :☇ => :a},
176
+ :inplace => {:▁ => :u, :─ => :del},
177
+ :linewide => {:☛ => :twit},
178
+ :block => {:✁ => :cut}
179
+ }
180
+ file = "#{File.dirname(__FILE__)}/../../../data/internals/posts.csv"
181
+ file_errors = "#{File.dirname(__FILE__)}/../../../data/internals/errors.txt"
182
+ FileUtils.rm file_errors if File.exist? file_errors
183
+
184
+ FileUtils.mkdir("#{File.dirname(__FILE__)}/../../../data/site")
185
+ # %w{txt pic ref twt}.each {|d| FileUtils.mkdir("#{File.dirname(__FILE__)}/../../../data/site/#{d}")}
186
+
187
+ puts "Reading #{file} …"
188
+ File.readlines(file).each { |l|
189
+ data = l.split('☢')
190
+ puts "Processing record #{data[0]}"
191
+ begin
192
+ html_doc = Qipowl::HtmlDoc.new nil
193
+ parser = Nokogiri::HTML::SAX::Parser.new(html_doc)
194
+ parser.parse(prepare data[2])
195
+ tags.rmerge! html_doc.tags
196
+ body = postpare(html_doc.qp)
197
+
198
+ body = body.strip if body
199
+
200
+ id = data[0]
201
+ title = data[1].gsub(/'/, "’")
202
+ date = data[3]
203
+ img = data[4]
204
+
205
+ if img && !img.empty? && !img.start_with?('http://')
206
+ img = "http://mudasobwa.ru/i/#{img.gsub(/\A\/+/, '')}"
207
+ end
208
+
209
+ q_doc = Qipowl::HtmlDoc.new nil
210
+ q_parser = Nokogiri::HTML::SAX::Parser.new(q_doc)
211
+ q_parser.parse(prepare data[5])
212
+ tags.rmerge! q_doc.tags
213
+ quote = postpare(q_doc.qp)
214
+
215
+ q_url = data[6]
216
+ type = data[7].to_i # 1 => text, 2 => image, 3 => quote, 4 => twit
217
+ stype = case type
218
+ when 1 then :txt
219
+ when 2 then :pic
220
+ when 3 then :ref
221
+ when 4 then :twt
222
+ else :txt
223
+ end
224
+
225
+ owl_text = %Q(---
226
+ title: '#{title}'
227
+ id: #{id}
228
+ date: '#{date}'
229
+ categories: [#{stype}]
230
+ ---
231
+
232
+ )
233
+ # owl_text << (type == 4 ? "☛ " : "§1 ")
234
+ # owl_text << title
235
+ # owl_text << "\n\n"
236
+
237
+ owl_text << case type
238
+ when 2
239
+ "#{img} #{body.gsub(/\R/, ' ⏎ ')}"
240
+ when 3
241
+ q_ref = q_url[/http:\/\/(.*?)\/|\Z/, 1].split('.').last(2).join('.') rescue nil
242
+ "\n〉 #{quote.strip}\n‒ #{q_ref ? q_ref : q_url}, #{q_url}\n\n#{body}"
243
+ else body
244
+ end
245
+
246
+ fname = "#{date.split.first}-#{title.to_filename}.owl"
247
+ fname = (1..100).each {|i|
248
+ break "#{date.split.first}-#{title.to_filename}-#{i}.owl" \
249
+ unless File.exist?("#{File.dirname(__FILE__)}/../../../data/site/#{date.split.first}-#{title.to_filename}-#{i}.owl")
250
+ } if File.exist?("#{File.dirname(__FILE__)}/../../../data/site/#{fname}")
251
+ File.open("#{File.dirname(__FILE__)}/../../../data/site/#{fname}", 'a') { |f| f.write(owl_text) }
252
+
253
+ rescue Exception => e
254
+ puts '—'*40
255
+ puts 'Error occured'
256
+ puts prepare(data[2])
257
+ puts '—'*40
258
+ puts prepare(data[5])
259
+ puts '—'*40
260
+ raise e
261
+ end
262
+ }
263
+
264
+ File.open("#{File.dirname(__FILE__)}/../../../data/site/rules.yaml", 'a') { |f|
265
+ f.write(tags.to_yaml)
266
+ }
267
+
268
+ end