asciidoctor_cjk_breaks 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 9c02afe8dd442f8c710ba0d7378e7725aac105a6f50531c380611d318860376e
4
+ data.tar.gz: d3ba448088b097772c1e0e04be5075ae89b3e2072d9ceb08a056f2a3f54fe777
5
+ SHA512:
6
+ metadata.gz: ad97861be354b68c41f5c640667c7ce54ece7be6686287cb73d54a0ae614ebb3acf67f2e8b6b1590101a6b471eb3a1489b9ef96a29d1d25f238502f18d3b0614
7
+ data.tar.gz: 3962921d6e1500c0b825e49bee236ae3cfd0edb828ca8b480694265116fdccfcc8871ce53ceb22021b5ee3018ac6b8ce3d050a808161d6ea587b37131deb1c90
@@ -0,0 +1,32 @@
1
+ = Asciidoctor CJK Breaks
2
+ Kaizhao Zhang
3
+
4
+ Asciidoctor CJK Breaks is an extension for Asciidoctor that suppresses line breaks between East Asian characters.
5
+
6
+ Normally, newlines in AsciiDoc text are rendered as newlines in the output HTML, and browsers usually render those newlines as whitespace (smarter behavior is described in W3C drafts, but is not actually implemented by vendors).
7
+
8
+ This extension finds and removes newlines that should not be converted to a space. The algorithm follows https://www.w3.org/TR/css-text-3/#line-break-transform[CSS Text Module Level 3]:
9
+
10
+ - If the character immediately before or immediately after the segment break is the zero-width space character (U+200B), then the break is removed, leaving behind the zero-width space.
11
+ - Otherwise, if the East Asian Width property [UAX11] of both the character before and after the segment break is F, W, or H (not A), and neither side is Hangul, then the segment break is removed.
12
+ - Otherwise, the segment break is converted to a space (U+0020).
13
+
14
+ ---
15
+
16
+ This is a Ruby port of https://github.com/markdown-it/markdown-it-cjk-breaks[markdown-it-cjk-breaks].
17
+
18
+ Status: https://github.com/markdown-it/markdown-it-cjk-breaks/commit/15c7a4144e0e6f94fada671a6eb2c7b63e2358f0[15c7a41]
19
+
20
+ == Install
21
+
22
+ $ gem install asciidoctor_cjk_breaks
23
+
24
+ == Usage
25
+
26
+ $ asciidoctor -r asciidoctor_cjk_breaks example.adoc
27
+
28
+ == Test
29
+
30
+ $ ruby test/test_cjk_breaks.rb
31
+
32
+ Then open the converted HTML files in the `test/fixtures` directory.
@@ -0,0 +1,85 @@
1
+ require 'asciidoctor/extensions'
2
+ require 'east_asian_width'
3
+
4
# Asciidoctor tree processor that removes line breaks inside paragraphs when
# the characters on both sides of the break are East Asian wide characters
# (or when either side is a zero-width space), so that the break does not end
# up being displayed as a space.
class CjkBreaksTreeprocessor < Asciidoctor::Extensions::Treeprocessor
  # Zero-width space; a line break adjacent to it is always removed.
  ZERO_WIDTH_SPACE = "\u200b"

  # Code point ranges treated as Hangul (taken from the Unicode 10.0.0
  # Script=Hangul regex used by markdown-it-cjk-breaks).
  HANGUL_RANGES = %w[
    \u1100-\u11FF
    \u302E
    \u302F
    \u3131-\u318E
    \u3200-\u321E
    \u3260-\u327E
    \uA960-\uA97C
    \uAC00-\uD7A3
    \uD7B0-\uD7C6
    \uD7CB-\uD7FB
    \uFFA0-\uFFBE
    \uFFC2-\uFFC7
    \uFFCA-\uFFCF
    \uFFD2-\uFFD7
    \uFFDA-\uFFDC
  ].freeze

  # Built from HANGUL_RANGES rather than written as a multi-line /x literal:
  # in extended mode whitespace inside a bracket expression is still literal,
  # so a multi-line character class would also match spaces and newlines.
  REGEXP_HANGUL_CHARS = Regexp.new("[#{HANGUL_RANGES.join}]")

  # Entry point called by Asciidoctor with the parsed document.
  #
  # @param document the document whose paragraphs are rewritten in place
  # @return [nil] always nil; the document is modified in place
  def process(document)
    return unless document.blocks?

    remove_cjk_breaks document
    nil
  end

  # Walks the children of +node+. Paragraphs whose content changes are
  # replaced by a new paragraph; every other block is descended into.
  #
  # The work is done on +block.content+, i.e. after Asciidoctor has turned the
  # raw source into block lines and then into content. It may be better to do
  # this earlier, while the raw source is turned into block lines
  # (`Asciidoctor::Block.initialize` ->
  # `Asciidoctor::Helpers.normalize_lines_from_string`).
  #
  # @param node a block (or the document) responding to +blocks+
  def remove_cjk_breaks(node)
    node.blocks.each_with_index do |block, index|
      if block.context == :paragraph
        joined = join_cjk_lines(block.content.lines)
        next unless joined

        node.blocks[index] = create_paragraph block.document, joined, block.attributes
      else
        remove_cjk_breaks block
      end
    end
  end

  # @param char [String] a single character
  # @return [Boolean] true when +char+ is in one of the Hangul ranges
  def hangul?(char)
    char.match?(REGEXP_HANGUL_CHARS)
  end

  private

  # Joins +lines+, dropping the line terminator of every line whose break
  # should be removed.
  #
  # @param lines [Array<String>] lines including their terminators
  # @return [String, nil] the joined text, or nil when no break was removed
  def join_cjk_lines(lines)
    changed = false
    merged = lines.each_with_index.map do |line, line_index|
      next line unless remove_break?(line, lines[line_index + 1])

      changed = true
      line.chomp
    end
    merged.join if changed
  end

  # Decides whether the break between +line+ and +next_line+ is removed.
  #
  # @param line [String] current line, including its terminator
  # @param next_line [String, nil] following line, nil for the last line
  # @return [Boolean]
  def remove_break?(line, next_line)
    return false unless next_line

    # Last character that is not part of the line terminator. A line made
    # only of terminators has no such character and is left untouched.
    last_char_idx = line.rindex(/[^\r\n]/)
    return false unless last_char_idx

    last_char = line[last_char_idx]
    next_char = next_line[0]
    return false unless next_char

    # Remove the newline if it is adjacent to ZWSP.
    return true if last_char == ZERO_WIDTH_SPACE || next_char == ZERO_WIDTH_SPACE

    # Remove the newline if both characters are fullwidth (F), wide (W) or
    # halfwidth (H), but not Hangul.
    wide?(last_char) && wide?(next_char) && !hangul?(last_char) && !hangul?(next_char)
  end

  # @param char [String] a single character
  # @return [Boolean] true when the East Asian Width of +char+ is F, W or H
  def wide?(char)
    EastAsianWidth.east_asian_width(char).match?(/\A[FWH]\z/)
  end
end
82
+
83
# Registers the tree processor with Asciidoctor when this file is loaded.
Asciidoctor::Extensions.register { treeprocessor CjkBreaksTreeprocessor }
@@ -0,0 +1 @@
1
+ require_relative 'asciidoctor/cjk_breaks_treeprocessor'
@@ -0,0 +1,72 @@
1
+ Remove linebreaks near ZWSP
2
+ .
3
+ foo​
4
+ bar
5
+ ​baz
6
+ .
7
+ <p>foo​bar​baz</p>
8
+ .
9
+
10
+ Remove linebreaks between hiragana (wide) characters
11
+ .
12
+ あおえ
13
+ いう
14
+ .
15
+ <p>あおえいう</p>
16
+ .
17
+
18
+ Remove linebreaks between halfwidth katakana
19
+ .
20
+ アオエ
21
+ イウ
22
+ .
23
+ <p>アオエイウ</p>
24
+ .
25
+
26
+ Remove linebreaks between fullwidth characters
27
+ .
28
+ !"#
29
+ $%
30
+ .
31
+ <p>!"#$%</p>
32
+ .
33
+
34
+ Keep linebreaks between hangul characters
35
+ .
36
+ ㅏㅗㅔ
37
+ ㅣㅜ
38
+ ᅡᅩᅦ
39
+ ᅵᅮ
40
+ .
41
+ <p>ㅏㅗㅔ
42
+ ㅣㅜ
43
+ ᅡᅩᅦ
44
+ ᅵᅮ</p>
45
+ .
46
+
47
+ Keep linebreaks between hiragana (wide) and english
48
+ .
49
+ あおえ
50
+ aoe
51
+ あおえ
52
+ .
53
+ <p>あおえ
54
+ aoe
55
+ あおえ</p>
56
+ .
57
+
58
+ Emphasis tokens should be skipped
59
+ .
60
+ *あおえ*
61
+ *いう*
62
+ .
63
+ <p><em>あおえ</em><em>いう</em></p>
64
+ .
65
+
66
+ Should recognize astral characters correctly
67
+ .
68
+ foo🈀
69
+ 🈀foo
70
+ .
71
+ <p>foo🈀🈀foo</p>
72
+ .
@@ -0,0 +1,85 @@
1
+ 滕王阁序 --- 王勃
2
+
3
+ 豫章故郡,洪都新府。
4
+ 星分翼轸,地接衡庐。
5
+ 襟三江而带五湖,控蛮荆而引瓯越。
6
+ 物华天宝,龙光射牛斗之墟;
7
+ 人杰地灵,徐孺下陈蕃之榻。
8
+ 雄州雾列,俊采星驰。
9
+ 台隍枕夷夏之交,宾主尽东南之美。
10
+ 都督阎公之雅望,棨戟遥临;
11
+ 宇文新州之懿范,襜帷暂驻。
12
+ 十旬休假,胜友如云;
13
+ 千里逢迎,高朋满座。
14
+ 腾蛟起凤,孟学士之词宗;
15
+ 紫电青霜,王将军之武库。
16
+ 家君作宰,路出名区;
17
+ 童子何知,躬逢胜饯。
18
+
19
+ 时维九月,序属三秋。
20
+ 潦水尽而寒潭清,烟光凝而暮山紫。
21
+ 俨骖騑于上路,访风景于崇阿。
22
+ 临帝子之长洲,得仙人之旧馆。
23
+ 层峦耸翠,上出重霄;
24
+ 飞阁流丹,下临无地。
25
+ 鹤汀凫渚,穷岛屿之萦回;
26
+ 桂殿兰宫,即冈峦之体势。
27
+
28
+ 披绣闼,俯雕甍,山原旷其盈视,川泽纡其骇瞩。
29
+ 闾阎扑地,钟鸣鼎食之家;
30
+ 舸舰迷津,青雀黄龙之舳。
31
+ 云销雨霁,彩彻区明。
32
+ 落霞与孤鹜齐飞,秋水共长天一色。
33
+ 渔舟唱晚,响穷彭蠡之滨,雁阵惊寒,声断衡阳之浦。
34
+
35
+ 遥襟甫畅,逸兴遄飞。
36
+ 爽籁发而清风生,纤歌凝而白云遏。
37
+ 睢园绿竹,气凌彭泽之樽;
38
+ 邺水朱华,光照临川之笔。
39
+ 四美具,二难并。
40
+ 穷睇眄于中天,极娱游于暇日。
41
+ 天高地迥,觉宇宙之无穷;
42
+ 兴尽悲来,识盈虚之有数。
43
+ 望长安于日下,目吴会于云间。
44
+ 地势极而南溟深,天柱高而北辰远。
45
+ 关山难越,谁悲失路之人;
46
+ 萍水相逢,尽是他乡之客。
47
+ 怀帝阍而不见,奉宣室以何年?
48
+
49
+ 嗟乎!
50
+ 时运不齐,命途多舛。
51
+ 冯唐易老,李广难封。
52
+ 屈贾谊于长沙,非无圣主;
53
+ 窜梁鸿于海曲,岂乏明时?
54
+ 所赖君子见机,达人知命。
55
+ 老当益壮,宁移白首之心?
56
+ 穷且益坚,不坠青云之志。
57
+ 酌贪泉而觉爽,处涸辙以犹欢。
58
+ 北海虽赊,扶摇可接;
59
+ 东隅已逝,桑榆非晚。
60
+ 孟尝高洁,空余报国之情;
61
+ 阮籍猖狂,岂效穷途之哭!
62
+
63
+ 勃,三尺微命,一介书生。
64
+ 无路请缨,等终军之弱冠;
65
+ 有怀投笔,慕宗悫之长风。
66
+ 舍簪笏于百龄,奉晨昏于万里。
67
+ 非谢家之宝树,接孟氏之芳邻。
68
+ 他日趋庭,叨陪鲤对;
69
+ 今兹捧袂,喜托龙门。
70
+ 杨意不逢,抚凌云而自惜;
71
+ 钟期既遇,奏流水以何惭?
72
+
73
+ 呜乎!
74
+ 胜地不常,盛筵难再;
75
+ 兰亭已矣,梓泽丘墟。
76
+ 临别赠言,幸承恩于伟饯;
77
+ 登高作赋,是所望于群公。
78
+ 敢竭鄙怀,恭疏短引;
79
+ 一言均赋,四韵俱成。
80
+ 请洒潘江,各倾陆海云尔:
81
+
82
+ 滕王高阁临江渚,佩玉鸣鸾罢歌舞。
83
+ 画栋朝飞南浦云,珠帘暮卷西山雨。
84
+ 闲云潭影日悠悠,物换星移几度秋。
85
+ 阁中帝子今何在?槛外长江空自流。
@@ -0,0 +1,4 @@
1
+ require_relative '../lib/asciidoctor_cjk_breaks'
2
+
3
# Convert each fixture with the extension loaded; the generated HTML files
# are meant to be inspected by hand.
%w[
  test/fixtures/cjk_breaks.txt
  test/fixtures/tengwanggexu-wangbo.txt
].each { |fixture| Asciidoctor.convert_file fixture }
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: asciidoctor_cjk_breaks
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Kaizhao Zhang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-11-09 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: asciidoctor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: east_asian_width
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: An extension for Asciidoctor that suppresses line breaks between east
42
+ asian characters.
43
+ email: zhangkaizhao@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - README.adoc
49
+ - lib/asciidoctor/cjk_breaks_treeprocessor.rb
50
+ - lib/asciidoctor_cjk_breaks.rb
51
+ - test/fixtures/cjk_breaks.txt
52
+ - test/fixtures/tengwanggexu-wangbo.txt
53
+ - test/test_cjk_breaks.rb
54
+ homepage: https://github.com/zhangkaizhao/asciidoctor_cjk_breaks
55
+ licenses:
56
+ - MIT
57
+ metadata: {}
58
+ post_install_message:
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubyforge_project:
74
+ rubygems_version: 2.7.6
75
+ signing_key:
76
+ specification_version: 4
77
+ summary: Suppress line breaks between east asian characters
78
+ test_files: []