qipowl 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +11 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/.yardopts +3 -0
- data/Gemfile +17 -0
- data/LICENSE +20 -0
- data/README.md +345 -0
- data/Rakefile +21 -0
- data/bin/bowler +44 -0
- data/config/bowlers/cmd.yaml +3 -0
- data/config/bowlers/html.yaml +128 -0
- data/config/bowlers/html_supplemental.yaml +3 -0
- data/config/bowlers/markdown2html.yaml +23 -0
- data/extras/demo/main.rb +34 -0
- data/extras/demo/public/apple-touch-icon-114x114-precomposed.png +0 -0
- data/extras/demo/public/apple-touch-icon-144x144-precomposed.png +0 -0
- data/extras/demo/public/apple-touch-icon-57x57-precomposed.png +0 -0
- data/extras/demo/public/apple-touch-icon-72x72-precomposed.png +0 -0
- data/extras/demo/public/apple-touch-icon-precomposed.png +0 -0
- data/extras/demo/public/apple-touch-icon.png +0 -0
- data/extras/demo/public/css/bootstrap-theme.css +384 -0
- data/extras/demo/public/css/bootstrap-theme.min.css +1 -0
- data/extras/demo/public/css/bootstrap.css +6805 -0
- data/extras/demo/public/css/bootstrap.min.css +9 -0
- data/extras/demo/public/css/main.css +22 -0
- data/extras/demo/public/favicon.ico +0 -0
- data/extras/demo/public/fonts/glyphicons-halflings-regular.eot +0 -0
- data/extras/demo/public/fonts/glyphicons-halflings-regular.svg +228 -0
- data/extras/demo/public/fonts/glyphicons-halflings-regular.ttf +0 -0
- data/extras/demo/public/fonts/glyphicons-halflings-regular.woff +0 -0
- data/extras/demo/public/html.html +262 -0
- data/extras/demo/public/index.html +110 -0
- data/extras/demo/public/js/main.js +1 -0
- data/extras/demo/public/js/vendor/bootstrap.js +1999 -0
- data/extras/demo/public/js/vendor/bootstrap.min.js +6 -0
- data/extras/demo/public/js/vendor/jquery-1.10.1.min.js +6 -0
- data/extras/demo/public/js/vendor/modernizr-2.6.2-respond-1.1.0.min.js +11 -0
- data/extras/drafts/parsing.md +137 -0
- data/extras/support/typo +66 -0
- data/features/bowler.feature +8 -0
- data/features/html.feature +229 -0
- data/features/step_definitions/bowler_steps.rb +39 -0
- data/features/step_definitions/html_steps.rb +11 -0
- data/features/support/env.rb +7 -0
- data/images/owl-old.png +0 -0
- data/images/owl-old.xcf +0 -0
- data/images/owl.png +0 -0
- data/images/owl.xcf +0 -0
- data/lib/qipowl/bowlers/cmd.rb +26 -0
- data/lib/qipowl/bowlers/html.rb +409 -0
- data/lib/qipowl/bowlers/htmldoc.rb +268 -0
- data/lib/qipowl/bowlers/yaml.rb +63 -0
- data/lib/qipowl/core/bowler.rb +251 -0
- data/lib/qipowl/core/mapper.rb +92 -0
- data/lib/qipowl/core/monkeypatches.rb +168 -0
- data/lib/qipowl/core/ruler.rb +106 -0
- data/lib/qipowl/utils/hash_recursive_merge.rb +72 -0
- data/lib/qipowl/utils/logging.rb +14 -0
- data/lib/qipowl/version.rb +3 -0
- data/lib/qipowl.rb +50 -0
- data/qipowl.gemspec +42 -0
- data/qipowl.komodoproject +4 -0
- data/spec/bowler_spec.rb +11 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/string_spec.rb +32 -0
- data/spec/yaml_test.yaml +10 -0
- metadata +254 -0
@@ -0,0 +1,409 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'net/http'
|
4
|
+
require 'htmlbeautifier'
|
5
|
+
|
6
|
+
require_relative '../core/bowler'
|
7
|
+
require_relative '../bowlers/htmldoc'
|
8
|
+
|
9
|
+
module Qipowl
|
10
|
+
# Module placeholder for dynamically created bowlers
|
11
|
+
module Bowlers
|
12
|
+
class Html < Bowler
|
13
|
+
##############################################################################
|
14
|
+
### Default handlers for all the types of markup ###
|
15
|
+
##############################################################################
|
16
|
+
|
17
|
+
# `:grip` default handler
|
18
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
19
|
+
# @return [Array] the array of words with trimmed `grip` tag
|
20
|
+
def ∀_grip *args
|
21
|
+
text = [*args].join(SEPARATOR)
|
22
|
+
mine, rest = text.split("#{__callee__}∎", 2)
|
23
|
+
[tagify(∃_grip_tag(__callee__), {:class => ∃_grip(__callee__)[:class]}, mine), rest]
|
24
|
+
end
|
25
|
+
|
26
|
+
# `:alone` default handler
|
27
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
28
|
+
# @return [Array] the array of words with prepended `alone` tag
|
29
|
+
def ∀_alone *args
|
30
|
+
[standalone(∃_alone_tag(__callee__), {:class => ∃_alone(__callee__)[:class]}), args]
|
31
|
+
end
|
32
|
+
|
33
|
+
# `:block` default handler
|
34
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
35
|
+
# @param [String] param the text to be placed on the same line as
|
36
|
+
# opening tag
|
37
|
+
# @return [Nil] nil
|
38
|
+
def ∀_block param, args
|
39
|
+
harvest __callee__,
|
40
|
+
tagify(
|
41
|
+
∃_block_tag(__callee__),
|
42
|
+
{:class => (param.strip.empty? ? ∃_block(__callee__)[:class] : param.strip)},
|
43
|
+
args.hsub(String::HTML_ENTITIES)
|
44
|
+
)
|
45
|
+
end
|
46
|
+
|
47
|
+
# `:magnet` default handler
|
48
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
49
|
+
# @return [Array] the array of words with trimmed `magnet` tag
|
50
|
+
def ∀_magnet *args
|
51
|
+
param, *rest = args.flatten
|
52
|
+
param = param.unbowl.to_s.prepend("#{__callee__}#{String::NBSP}")
|
53
|
+
[tagify(∃_magnet_tag(__callee__), {:class => ∃_magnet(__callee__)[:class]}, param), rest]
|
54
|
+
end
|
55
|
+
|
56
|
+
# `:regular` default handler
|
57
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
58
|
+
def ∀_regular *args
|
59
|
+
harvest __callee__,
|
60
|
+
tagify(
|
61
|
+
∃_regular_tag(canonize(__callee__)),
|
62
|
+
{:class => ∃_regular(canonize(__callee__))[:class]},
|
63
|
+
args
|
64
|
+
)
|
65
|
+
end
|
66
|
+
|
67
|
+
##############################################################################
|
68
|
+
### Grip :: Specific handlers ###
|
69
|
+
##############################################################################
|
70
|
+
# Handler for abbrs.
|
71
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
72
|
+
# @return [Array] the array of words with trimmed `abbr` tag
|
73
|
+
def † *args
|
74
|
+
term, *title = args.flatten
|
75
|
+
mine, rest = [*title].join(SEPARATOR).split("#{__callee__}∎", 2)
|
76
|
+
[tagify(∃_grip_tag(__callee__), {:title => mine, :class => ∃_grip(__callee__)[:class]}, term), rest]
|
77
|
+
end
|
78
|
+
|
79
|
+
# Handler for anchors.
|
80
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
81
|
+
# @return [Array] the array of words with trimmed `a` tag
|
82
|
+
def ⚓ *args
|
83
|
+
href, *title = args.flatten
|
84
|
+
mine, rest = [*title].join(SEPARATOR).split("#{__callee__}∎", 2)
|
85
|
+
href = href.unbowl
|
86
|
+
[
|
87
|
+
case get_href_content(href)
|
88
|
+
when :img
|
89
|
+
standalone :img, { :src => href, :alt => [*mine].join(SEPARATOR), :class => 'inplace' }
|
90
|
+
else
|
91
|
+
tagify ∃_grip_tag(__callee__), {:href => href}, mine
|
92
|
+
end, rest
|
93
|
+
]
|
94
|
+
end
|
95
|
+
|
96
|
+
##############################################################################
|
97
|
+
### Alone :: Specific handlers ###
|
98
|
+
##############################################################################
|
99
|
+
# `:alone` handler for horizontal rule; it differs from default
|
100
|
+
# handler since orphans around must be handled as well.
|
101
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
102
|
+
# @return [Nil] nil
|
103
|
+
def —— *args
|
104
|
+
harvest nil, orphan(args.join(SEPARATOR)) unless args.vacant?
|
105
|
+
harvest __callee__, standalone(∃_alone_tag(__callee__))
|
106
|
+
end
|
107
|
+
|
108
|
+
##############################################################################
|
109
|
+
### Block :: Specific handlers ###
|
110
|
+
##############################################################################
|
111
|
+
# `:block` handler for comment (required because comments are
|
112
|
+
# formatted in HTML in some specific way.)
|
113
|
+
# @param [String] param the text to be placed on the same line as opening tag
|
114
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
115
|
+
# @return [Nil] nil
|
116
|
+
def ✍ *args
|
117
|
+
[]
|
118
|
+
end
|
119
|
+
|
120
|
+
##############################################################################
|
121
|
+
### Magnet :: Specific handlers ###
|
122
|
+
##############################################################################
|
123
|
+
# `:magnet` handler for reference to Livejournal user.
|
124
|
+
# @param [String] param the text to be placed on the same line as opening tag
|
125
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
126
|
+
# @return [Nil] nil
|
127
|
+
def ✎ *args
|
128
|
+
param, *rest = args.flatten
|
129
|
+
param = param.unbowl
|
130
|
+
ljref = "<span style='white-space: nowrap;'><a href='http://#{param}.livejournal.com/profile?mode=full'><img src='http://l-stat.livejournal.com/img/userinfo.gif' alt='[info]' style='border: 0pt none ; vertical-align: bottom; padding-right: 1px;' height='17' width='17'></a><a href='http://#{param}.livejournal.com/?style=mine'><b>#{param}</b></a></span>"
|
131
|
+
[ljref, rest]
|
132
|
+
end
|
133
|
+
|
134
|
+
def ☇ *args
|
135
|
+
param, *rest = args.flatten
|
136
|
+
[tagify(∃_magnet_tag(__callee__), {:name => param.unbowl}, String::ZERO_WIDTH_SPACE), rest]
|
137
|
+
end
|
138
|
+
|
139
|
+
##############################################################################
|
140
|
+
### Regular :: Specific handlers ###
|
141
|
+
##############################################################################
|
142
|
+
# Handler for Youtube video
|
143
|
+
# @param [Array] args the words, gained since last call to {#harvest}
|
144
|
+
# @return [Nil] nil
|
145
|
+
def ✇ *args
|
146
|
+
id, *rest = args.flatten
|
147
|
+
harvest nil, orphan(rest.join(SEPARATOR)) unless rest.vacant?
|
148
|
+
harvest __callee__, %Q(
|
149
|
+
<iframe class='youtube' width='560' height='315' src='http://www.youtube.com/embed/#{id.unbowl}'
|
150
|
+
frameborder='0' allowfullscreen></iframe>
|
151
|
+
)
|
152
|
+
end
|
153
|
+
|
154
|
+
# Handler for standalone pictures with captions (rendered as `<figure>`).
|
155
|
+
# @todo Make it to understand quotes when there is a plain HTML on the other side
|
156
|
+
#
|
157
|
+
# @param
|
158
|
+
# @return [Nil] nil
|
159
|
+
def ⚘ *args
|
160
|
+
href, *title = args.flatten
|
161
|
+
harvest __callee__, %Q(
|
162
|
+
<figure>
|
163
|
+
<img src='#{href.unbowl}'/>
|
164
|
+
<figcaption>
|
165
|
+
<p>
|
166
|
+
#{title.join(SEPARATOR)}
|
167
|
+
</p>
|
168
|
+
</figcaption>
|
169
|
+
</figure>
|
170
|
+
)
|
171
|
+
end
|
172
|
+
|
173
|
+
# `:regular` handler for data list items. A data list item consists of
# two sibling tags, `dt` (the term) and `dd` (its definition), hence the
# default regular handler cannot be used.
#
# The words are joined with `SEPARATOR` and split on the first em dash
# surrounded by whitespace: the left part becomes the term, everything
# to the right of it becomes the definition.
#
# @param [Array] args the words, gained since last call to {#harvest}
# @return the result of {#harvest}
def ▶ *args
  # The limit of 2 keeps any further " — " inside the definition; without
  # it the definition was cut at the second dash and the rest was lost.
  dt, dd = args.join(SEPARATOR).split(/\s+(?:—)\s+/, 2)
  harvest __callee__, "\n#{tagify(:dt, {}, dt)}\n#{tagify(:dd, {}, dd)}\n"
end
|
184
|
+
# Alias for {#▶}, according to YAML rules specifies additional
|
185
|
+
# class for the data list `<dl>` tag behind (`dl-horizontal`.)
|
186
|
+
alias_method :▷, :▶
|
187
|
+
|
188
|
+
protected
|
189
|
+
# Determines the nesting level of a `:linewide` element, that is the
# number of non-breakable spaces (`String::NBSP`) its DSL tag starts
# with. Nested lists, for instance, are written by putting `"\u{00A0}"`
# in front of the line item DSL tag:
#
#     li = "• li1 \u{00A0}• nested 1 \u{00A0}• nested 2 • li2"
#
# @param [Symbol|String] callee the DSL symbol to get the level information for.
# @return [Integer] zero-based position of the first character that is
#   not a non-breakable space, or `-1` when there is no such character
#   (`nil`, an empty name, or a name made of non-breakable spaces only).
def level callee
  first_regular = callee.to_s.each_char.find_index { |char| char != String::NBSP }
  first_regular.nil? ? -1 : first_regular
end
|
203
|
+
|
204
|
+
def canonize callee
|
205
|
+
callee.to_s.gsub(/^#{String::NBSP}*/, '').to_sym if callee
|
206
|
+
end
|
207
|
+
|
208
|
+
# @see Qipowl::Bowler#harvest
|
209
|
+
#
|
210
|
+
# Additionally it checks if there was a `:linewide` item, requiring
|
211
|
+
# surrounding html element (like `<ul>` around several `<li>`s.)
|
212
|
+
#
|
213
|
+
# @param [Symbol] callee of method
|
214
|
+
# @param [String] str to be harvested
|
215
|
+
def harvest callee, str
|
216
|
+
if callee.nil? || callee != @callee
|
217
|
+
level(callee).downto(level(@callee) + 1) { |i|
|
218
|
+
str += i.␚ify
|
219
|
+
} unless ∃_enclosures(canonize(callee)).nil?
|
220
|
+
|
221
|
+
if prev = ∃_enclosures(canonize(@callee))
|
222
|
+
level(@callee).downto(level(callee) + 1) { |i|
|
223
|
+
@yielded.last.sub!(/\A/, opening(prev[:tag], {:class => prev[:class]}))
|
224
|
+
@yielded.each { |s| s.gsub!(/#{i.␚ify}/) { closing(prev[:tag]) } }
|
225
|
+
}
|
226
|
+
end
|
227
|
+
|
228
|
+
@callee = callee
|
229
|
+
end
|
230
|
+
super callee, str
|
231
|
+
end
|
232
|
+
|
233
|
+
private
|
234
|
+
# Hence we cannot simply declare the DSL for it, we need to handle
|
235
|
+
# calls to all the _methods_, starting with those symbols.
|
236
|
+
#
|
237
|
+
# @param [Symbol] method as specified by caller (`method_missing`.)
|
238
|
+
# @param [Array] args as specified by caller (`method_missing`.)
|
239
|
+
# @param [Proc] block as specified by caller (`method_missing`.)
|
240
|
+
#
|
241
|
+
# @return [Array] the array of words
|
242
|
+
def special_handler method, *args, &block
|
243
|
+
# Sublevel markers, e.g. “ •” is level 2 line-item
|
244
|
+
return [method, args].flatten \
|
245
|
+
unless level(method) > 0 && self.class::REGULAR_TAGS.keys.include?(canonize(method))
|
246
|
+
|
247
|
+
self.class.class_eval "alias_method :#{method}, :#{canonize(method)}"
|
248
|
+
send method, args, block
|
249
|
+
end
|
250
|
+
|
251
|
+
# Produces html paragraph tag (`<p>`) with class `owl`.
|
252
|
+
# @see Qipowl::Bowler#orphan
|
253
|
+
# @param str the words, to be put in paragraph tag.
|
254
|
+
# @return [String] tagged words.
|
255
|
+
def orphan str
|
256
|
+
"#{tagify(:p, {}, str.to_s.strip)}"
|
257
|
+
end
|
258
|
+
# Constructs opening html tag for the input given.
#
# To construct `abbr` tag with `title` _Title_ and class _default_:
#
#     opening :abbr, { :title=>'Title', :class=>'default' }
#     # => "<abbr class='default' title='Title'>"
#
# Attributes with a `nil` value are omitted. Attributes are written in
# the reverse order of the hash. Values are wrapped in single quotes,
# so an apostrophe inside a value is emitted as `&#39;`.
#
# @param [String] tag to produce opening tag string from.
# @param [Hash] params to be put into opening tag as attributes.
# @return [String] opening tag for the input given.
def opening tag, params={}
  attrs = params.reject { |_, value| value.nil? }.reverse_each.map { |name, value|
    # a raw apostrophe would terminate the single-quoted attribute early
    escaped = value.to_s.gsub("'", '&#39;')
    " #{name}='#{escaped}'"
  }.join
  "<#{tag}#{attrs}>"
end
|
271
|
+
|
272
|
+
# Constructs closing html tag for the input given.
|
273
|
+
#
|
274
|
+
# @param [String] tag to produce closing tag string from.
|
275
|
+
# @return [String] closing tag for the input given.
|
276
|
+
def closing tag
|
277
|
+
"</#{tag}>"
|
278
|
+
end
|
279
|
+
|
280
|
+
# (see opening)
# Acts most like an {#opening} method, but closes an element inplace
# (used for `hr`, `br`, `img`).
#
# @param [String] tag to produce the self-closing tag string from.
# @param [Hash] params to be put into the tag as attributes.
# @return [String] the tag built by {#opening}, terminated with `/>`.
def standalone tag, params={}
  # Only the bracket terminating the tag is rewritten. Replacing the
  # first `>` corrupted tags having `>` inside an attribute value
  # (e.g. an image `alt` text).
  opening(tag, params).sub(/>\z/, '/>')
end
|
286
|
+
# Constructs valid tag for the input given, concatenating
|
287
|
+
# opening and closing tags around the text passed in `args`.
|
288
|
+
#
|
289
|
+
# @param [String] tag to produce html tag string from.
|
290
|
+
# @param [Hash] params to be put into opening tag as attributes.
|
291
|
+
# @param [Array] args the words, to be tagged around.
|
292
|
+
# @return [String] the text wrapped in opening and closing tags, or an empty string when the text is vacant.
|
293
|
+
def tagify tag, params, *args
|
294
|
+
text = [*args].join(SEPARATOR)
|
295
|
+
text.vacant? ? '' : "#{opening tag, params}#{text}#{closing tag}"
|
296
|
+
end
|
297
|
+
|
298
|
+
|
299
|
+
# Determines content of remote link by href.
# TODO Make image patterns configurable.
#
# No request is made; the decision is taken by looking at the link text
# only. A link is considered an image when it ends with a known image
# file extension (case-insensitive, the leading dot is required) or
# when it contains `//i.chzbgr`.
#
# @param [String] href link to remote resource
# @return [Symbol] content type (`:img` or `:text` currently)
def get_href_content href
  href = href.to_s.unbowl.strip
  # The dot is part of the pattern on purpose: matching the bare suffix
  # treated links like `.../tags/gif` as pictures.
  if href =~ /\.(?:png|jpe?g|gif)\z/i || href =~ /\/\/i\.chzbgr/
    :img
  else
    :text
  end

  # uri = URI(href.to_s.unbowl)
  # Net::HTTP.start(uri.host, uri.port) do |http|
  # http.open_timeout = 1
  # http.read_timeout = 1
  #
  # request = Net::HTTP::Head.new uri
  # response = http.request request
  # case response.to_hash["content-type"].first
  # when /image/ then return :img
  # when /text/ then return :text
  # end
  # end
  # :unknown
  #rescue
  # logger.warn "Unable to determine link [#{href.to_s.unbowl}] type: no internet connection. Reverting to default."
  # :unknown
end
|
330
|
+
|
331
|
+
end
|
332
|
+
end
|
333
|
+
end
|
334
|
+
=begin
|
335
|
+
# Markup processor for Html output.
|
336
|
+
#
|
337
|
+
# This class produces HTML from markup as Markdown does.
|
338
|
+
|
339
|
+
# Amount of unnamed instances of the class (needed for new class name generation)
|
340
|
+
@@inst_count = 0
|
341
|
+
|
342
|
+
|
343
|
+
# `:handshake` default handler
|
344
|
+
# @param [String] from packed as string operand “before”
|
345
|
+
# @param [String] till packed as string operand “after”
|
346
|
+
# @return
|
347
|
+
def ∈ *args
|
348
|
+
from, till, *rest = args.flatten
|
349
|
+
tag = @mapping.handshake(__callee__)
|
350
|
+
tag = tag[:tag] if Hash === tag
|
351
|
+
[tagify(tag, {}, "#{from.unbowl}#{__callee__}#{till.unbowl}".gsub(String::SYMBOL_FOR_SPACE, ' ')), rest]
|
352
|
+
end
|
353
|
+
alias_method :⊂, :∈
|
354
|
+
|
355
|
+
|
356
|
+
|
357
|
+
# @see {Qipowl::Bowler#defreeze}
|
358
|
+
#
|
359
|
+
# Additionally it checks if tag is a `:block` tag and
|
360
|
+
# substitutes all the carriage returns (`$/`) with special symbol
|
361
|
+
# {String::CARRIAGE_RETURN} to prevent format damage.
|
362
|
+
#
|
363
|
+
# @param [String] str to be defreezed
|
364
|
+
def defreeze str
|
365
|
+
str = super str
|
366
|
+
@mapping[:block].each { |tag, htmltag|
|
367
|
+
str.gsub!(/(#{tag})(.*?)$(.*?)(#{tag}|\Z)/m) { |m|
|
368
|
+
"#{$1}('#{$2}', '#{$3}')\n\n"
|
369
|
+
}
|
370
|
+
}
|
371
|
+
str
|
372
|
+
end
|
373
|
+
|
374
|
+
# @see {Qipowl::Bowler#serveup}
|
375
|
+
#
|
376
|
+
# Additionally it beautifies the output HTML
|
377
|
+
#
|
378
|
+
# @param [String] str to be roasted
|
379
|
+
def serveup str
|
380
|
+
result = ''
|
381
|
+
%w(. , : ; ! ? »).map(&:bowl).each { |punct|
|
382
|
+
str.gsub!(/(?:\p{Space}|#{String::CARRIAGE_RETURN})*(#{punct})/, '\1')
|
383
|
+
# str.gsub!(/(#{punct})(?=\p{Alnum})/, '\1 ')
|
384
|
+
}
|
385
|
+
%w(«).map(&:bowl).each { |punct|
|
386
|
+
str.gsub!(/(#{punct})(?:\p{Space}|#{String::CARRIAGE_RETURN})*/, '\1')
|
387
|
+
str.gsub!(/(?<=\p{Alnum})(#{punct})/, ' \1')
|
388
|
+
}
|
389
|
+
served = super(str)
|
390
|
+
begin
|
391
|
+
HtmlBeautifier::Beautifier.new(result).scan(served)
|
392
|
+
rescue
|
393
|
+
logger.error "Was unable to tidyfy resulting HTML. Returning as is."
|
394
|
+
result = served
|
395
|
+
end
|
396
|
+
result
|
397
|
+
end
|
398
|
+
|
399
|
+
end
|
400
|
+
|
401
|
+
if __FILE__ == $0
|
402
|
+
|
403
|
+
i = 0
|
404
|
+
Dir.glob("#{File.dirname(__FILE__)}/../../../data/octopress-site/source/_posts/**/*.owl").each {|f|
|
405
|
+
puts "Processing ##{i += 1}: #{f}"
|
406
|
+
Qipowl::Html.parse File.read(f)
|
407
|
+
}
|
408
|
+
end
|
409
|
+
=end
|
@@ -0,0 +1,268 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'yaml'
|
6
|
+
require_relative '../core/monkeypatches.rb'
|
7
|
+
require_relative '../utils/hash_recursive_merge.rb'
|
8
|
+
|
9
|
+
module Qipowl
|
10
|
+
|
11
|
+
class HtmlDoc < Nokogiri::XML::SAX::Document
|
12
|
+
attr_reader :qp, :tags
|
13
|
+
def initialize mapping
|
14
|
+
@mapping = mapping
|
15
|
+
@inside = nil
|
16
|
+
@collector = {}
|
17
|
+
@tags = {:inplace => {}, :linewide => {}}
|
18
|
+
@qp = ''
|
19
|
+
@level = 0
|
20
|
+
end
|
21
|
+
|
22
|
+
# SAX callback for an opening tag. Appends the markup corresponding to
# the tag to the accumulated text (`@qp`) and stores in `@inside` and
# `@collector` the state that {#characters} and {#end_element} read.
# Classes met on `p`, `div`, `span` and `sup` are registered in `@tags`.
#
# @param [String] name the name of the tag
# @param [Array] attributes the attributes of the tag as `[name, value]` pairs
# @raise [RuntimeError] when the tag is not among the handled ones
def start_element name, attributes = []
  current_attrs = Hash[attributes]

  @qp += case name.to_sym
         when :p, :div
           if current_attrs['class']
             @collector[name.to_sym] = "✿_#{name.to_sym}_#{current_attrs['class'].gsub(/\s+/, '_')}".to_sym
             @tags[:linewide][@collector[name.to_sym]] = "#{name.to_sym}†#{current_attrs['class'].gsub(/\s+/, '†')}".to_sym
             "\n\n#{@collector[name.to_sym]} "
           else
             "\n\n"
           end
         when :ul, :ol, :table, :dl
           @inside = name.to_sym
           "\n"
         when :pre then "\n\nΛ\n"
         when :tr then " ┇ "
         when :td then " ┆ "
         when :a
           @inside = :a
           @collector[:href] = current_attrs['href']
           @collector[:name] = current_attrs['name']
           ''
         when :li then (@inside == :ol) ? "◦ " : "• "
         when :b, :strong then "≡"
         when :i, :em, :nobr then "≈"
         when :strike, :del, :s then "─"
         when :small then "↓"
         when :u then "▁"
         when :code, :tt then "λ"
         when :dfn, :abbr, :cite
           @inside = name.to_sym
           @collector[:title] = current_attrs['title']
           # Nothing is written here: the title is emitted by end_element.
           # Returning the assignment value raised TypeError for a tag
           # without `title` and leaked the title into the text otherwise.
           ''
         when :hr then "\n\n——\n\n"
         when :br then " ⏎\n"
         when :center then "\n— "
         when :dt then "▷ "
         when :dd then " — "
         when :h1 then "§1 "
         when :h2 then "§2 "
         when :h3 then "§3 "
         when :h4 then "§4 "
         when :h5 then "§5 "
         when :h6 then "§6 "
         when :blockquote then "\n\n〉 "
         when :figure
           @inside = :figure
           "\n\n"
         when :figcaption then " "
         when :img then fix_href(current_attrs['src'])
         when :span, :sup
           if current_attrs['class'].nil?
             ''
           else
             # `sup` is registered under the `span` name as well
             @collector[name.to_sym] = "✿_span_#{current_attrs['class'].gsub(/\s+/, '_')}".to_sym
             @tags[:inplace][@collector[name.to_sym]] = "span†#{current_attrs['class'].gsub(/\s+/, '†')}".to_sym
             " #{@collector[name.to_sym]}"
           end
         when :embed, :iframe then "\n\n#{current_attrs['src']}\n\n"
         when :html, :body, :object, :param, :thead, :tbody, :font, :'lj-embed', :'lj-cut'
           ''
         else
           raise "=== Unhandled: #{name} with attrs: [#{current_attrs}]"
         end
end
|
88
|
+
|
89
|
+
# SAX callback for character data. Inside `a`, `dfn` and `abbr` the text
# is collected in `@collector[:text]` to be written by {#end_element};
# anywhere else it is appended to the accumulated text (`@qp`).
#
# @param [String] str the chunk of character data
def characters str
  case @inside
  when :a, :dfn, :abbr
    # The parser may deliver one text node in several chunks, thus the
    # chunk is appended to what was collected so far, not assigned.
    @collector[:text] = "#{@collector[:text]}#{str}"
  else
    @qp += str
  end
end
|
97
|
+
|
98
|
+
# SAX callback for a closing tag. Appends to the accumulated text
# (`@qp`) the markup closing the element and resets the state stored
# by {#start_element} (`@inside`, entries of `@collector`).
#
# @param [String] name the name of the tag
def end_element name
  @qp += case name.to_sym
         when :p, :div
           @collector.delete(name.to_sym)
           "\n\n"
         when :a
           @inside = nil
           if (href = @collector.delete(:href))
             " #{(@collector.delete(:text) || '').gsub(/\s+/, "\u{00A0}")}¹#{fix_href href} "
           else
             # no href: written as a named anchor
             "☇ #{@collector.delete(:name)} #{@collector.delete(:text)}"
           end
         when :dfn, :abbr, :cite
           @inside = nil
           text = @collector.delete(:text)
           title = @collector.delete(:title)
           # Explicit check instead of a `rescue` modifier, which would
           # hide any error. Nothing is written when no text was collected.
           text.nil? ? '' : " #{text.gsub(/\s+/, "\u{00A0}")}†#{title}† "
         when :ul, :ol, :table, :dl
           @inside = nil
           "\n"
         when :li then "\n"
         when :pre then "\nΛ\n\n"
         when :b, :strong then "≡"
         when :i, :em, :nobr then "≈"
         when :u then "▁"
         when :dd then "\n"
         when :strike, :del, :s then "─"
         when :small then "↓"
         when :code, :tt then "λ"
         when :span, :sup
           "#{@collector.delete(name.to_sym)} "
         when :h1, :h2, :h3, :h4, :h5, :h6 then "\n\n"
         when :blockquote then "\n\n"
         when :figure
           @inside = nil
           "\n\n"
         else
           ''
         end
end
|
135
|
+
|
136
|
+
private
|
137
|
+
# Turns a site-relative link into an absolute one. A link starting
# with `http` is returned as is; any other link gets its leading
# slashes removed and `site` put in front of it.
#
# @param [String, nil] href the link to fix
# @param [String] site the prefix for relative links
# @return [String] the absolute link; an empty string for a `nil` link
def fix_href href, site = 'http://mudasobwa.ru/'
  # e.g. an `img` tag without `src`: nothing to fix, nothing to write
  return '' if href.nil?

  href.start_with?('http') ? href : "#{site}#{href.sub(/\A\/+/, '')}"
end
|
140
|
+
end
|
141
|
+
|
142
|
+
end
|
143
|
+
|
144
|
+
if __FILE__ == $0
|
145
|
+
|
146
|
+
def prepare str
|
147
|
+
str.gsub(/&[nm]dash;/, '—') # dashes
|
148
|
+
.gsub(/ /, ' ') # dashes
|
149
|
+
.gsub(/\s+--\s+/, ' — ') # dashes
|
150
|
+
.gsub(/^\s*/, '') # leading spaces
|
151
|
+
.gsub(/<img src="\/i\/>/, '')
|
152
|
+
.gsub(/™/, '™') # other entities
|
153
|
+
.gsub(/©/, '©') # other entities
|
154
|
+
.gsub(/(1st@1stone.ru|am@secondiary.ru)/, 'am@mudasobwa.ru')
|
155
|
+
.gsub(/http:\/\/(www\.)?(secondiary|1stone|matiouchkine.net)\.ru/, 'http://mudasobwa.ru') # obsolete site name
|
156
|
+
.gsub(/\[(http[^\]]*)\]/, '\1') # obsolete markdown pics
|
157
|
+
.gsub(/<span>\s*<\/span>/, 'λ\1λ') # obsolete markdown pics
|
158
|
+
.gsub(/<lj (?:comm|user)="(.*?)">/, '✎ \1') # obsolete markdown pics
|
159
|
+
.gsub(/<([^<>]*?@[^<>]*?)>/, '\1') # obsolete markdown pics
|
160
|
+
.gsub(/<imgsrc=/, '<img src=') # obsolete markdown pics
|
161
|
+
.gsub(/<ahref=/, '<a href=') # obsolete markdown pics
|
162
|
+
.gsub(/<\/p>\s*<p>\s*—/, " ⏎\n—") # direct speech
|
163
|
+
.gsub(/<br(?:\s*\/?\s*)>\s*<br(?:\s*\/?\s*)>/, "\n\n") # old-fashioned carriage
|
164
|
+
.gsub(/<[!]--[^<>]*?-->/, '') # comments
|
165
|
+
# .gsub(/([\.,:;!?])(?=\S)/, '\1 ') # fix punctuation
|
166
|
+
end
|
167
|
+
|
168
|
+
# Tidies the converted text: every run of two or more line breaks is
# collapsed into exactly one empty line, then whitespace and `⏎`
# characters are removed from the very beginning and the very end.
#
# @param [String] str the text to tidy
# @return [String] the tidied copy of the text
def postpare str
  collapsed = str.gsub(/\R{2,}/, "\n\n")
  collapsed.sub(/\A[\s⏎]*/, '').sub(/[\s⏎]*\z/, '')
end
|
173
|
+
|
174
|
+
tags = {
|
175
|
+
:magnet => {:✎ => :lj, :☇ => :a},
|
176
|
+
:inplace => {:▁ => :u, :─ => :del},
|
177
|
+
:linewide => {:☛ => :twit},
|
178
|
+
:block => {:✁ => :cut}
|
179
|
+
}
|
180
|
+
file = "#{File.dirname(__FILE__)}/../../../data/internals/posts.csv"
|
181
|
+
file_errors = "#{File.dirname(__FILE__)}/../../../data/internals/errors.txt"
|
182
|
+
FileUtils.rm file_errors if File.exist? file_errors
|
183
|
+
|
184
|
+
FileUtils.mkdir("#{File.dirname(__FILE__)}/../../../data/site")
|
185
|
+
# %w{txt pic ref twt}.each {|d| FileUtils.mkdir("#{File.dirname(__FILE__)}/../../../data/site/#{d}")}
|
186
|
+
|
187
|
+
puts "Reading #{file} …"
|
188
|
+
File.readlines(file).each { |l|
|
189
|
+
data = l.split('☢')
|
190
|
+
puts "Processing record #{data[0]}"
|
191
|
+
begin
|
192
|
+
html_doc = Qipowl::HtmlDoc.new nil
|
193
|
+
parser = Nokogiri::HTML::SAX::Parser.new(html_doc)
|
194
|
+
parser.parse(prepare data[2])
|
195
|
+
tags.rmerge! html_doc.tags
|
196
|
+
body = postpare(html_doc.qp)
|
197
|
+
|
198
|
+
body = body.strip if body
|
199
|
+
|
200
|
+
id = data[0]
|
201
|
+
title = data[1].gsub(/'/, "’")
|
202
|
+
date = data[3]
|
203
|
+
img = data[4]
|
204
|
+
|
205
|
+
if img && !img.empty? && !img.start_with?('http://')
|
206
|
+
img = "http://mudasobwa.ru/i/#{img.gsub(/\A\/+/, '')}"
|
207
|
+
end
|
208
|
+
|
209
|
+
q_doc = Qipowl::HtmlDoc.new nil
|
210
|
+
q_parser = Nokogiri::HTML::SAX::Parser.new(q_doc)
|
211
|
+
q_parser.parse(prepare data[5])
|
212
|
+
tags.rmerge! q_doc.tags
|
213
|
+
quote = postpare(q_doc.qp)
|
214
|
+
|
215
|
+
q_url = data[6]
|
216
|
+
type = data[7].to_i # 1 => text, 2 => image, 3 => quote, 4 => twit
|
217
|
+
stype = case type
|
218
|
+
when 1 then :txt
|
219
|
+
when 2 then :pic
|
220
|
+
when 3 then :ref
|
221
|
+
when 4 then :twt
|
222
|
+
else :txt
|
223
|
+
end
|
224
|
+
|
225
|
+
owl_text = %Q(---
|
226
|
+
title: '#{title}'
|
227
|
+
id: #{id}
|
228
|
+
date: '#{date}'
|
229
|
+
categories: [#{stype}]
|
230
|
+
---
|
231
|
+
|
232
|
+
)
|
233
|
+
# owl_text << (type == 4 ? "☛ " : "§1 ")
|
234
|
+
# owl_text << title
|
235
|
+
# owl_text << "\n\n"
|
236
|
+
|
237
|
+
owl_text << case type
|
238
|
+
when 2
|
239
|
+
"#{img} #{body.gsub(/\R/, ' ⏎ ')}"
|
240
|
+
when 3
|
241
|
+
q_ref = q_url[/http:\/\/(.*?)\/|\Z/, 1].split('.').last(2).join('.') rescue nil
|
242
|
+
"\n〉 #{quote.strip}\n‒ #{q_ref ? q_ref : q_url}, #{q_url}\n\n#{body}"
|
243
|
+
else body
|
244
|
+
end
|
245
|
+
|
246
|
+
fname = "#{date.split.first}-#{title.to_filename}.owl"
|
247
|
+
fname = (1..100).each {|i|
|
248
|
+
break "#{date.split.first}-#{title.to_filename}-#{i}.owl" \
|
249
|
+
unless File.exist?("#{File.dirname(__FILE__)}/../../../data/site/#{date.split.first}-#{title.to_filename}-#{i}.owl")
|
250
|
+
} if File.exist?("#{File.dirname(__FILE__)}/../../../data/site/#{fname}")
|
251
|
+
File.open("#{File.dirname(__FILE__)}/../../../data/site/#{fname}", 'a') { |f| f.write(owl_text) }
|
252
|
+
|
253
|
+
rescue Exception => e
|
254
|
+
puts '—'*40
|
255
|
+
puts 'Error occured'
|
256
|
+
puts prepare(data[2])
|
257
|
+
puts '—'*40
|
258
|
+
puts prepare(data[5])
|
259
|
+
puts '—'*40
|
260
|
+
raise e
|
261
|
+
end
|
262
|
+
}
|
263
|
+
|
264
|
+
File.open("#{File.dirname(__FILE__)}/../../../data/site/rules.yaml", 'a') { |f|
|
265
|
+
f.write(tags.to_yaml)
|
266
|
+
}
|
267
|
+
|
268
|
+
end
|