RubyGems - asciidoctor_cjk_breaks - Versions diffs - 0.0.1 - Mend

asciidoctor_cjk_breaks 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +7 -0
data/README.adoc +32 -0
data/lib/asciidoctor/cjk_breaks_treeprocessor.rb +85 -0
data/lib/asciidoctor_cjk_breaks.rb +1 -0
data/test/fixtures/cjk_breaks.txt +72 -0
data/test/fixtures/tengwanggexu-wangbo.txt +85 -0
data/test/test_cjk_breaks.rb +4 -0
metadata +78 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 9c02afe8dd442f8c710ba0d7378e7725aac105a6f50531c380611d318860376e
+  data.tar.gz: d3ba448088b097772c1e0e04be5075ae89b3e2072d9ceb08a056f2a3f54fe777
+SHA512:
+  metadata.gz: ad97861be354b68c41f5c640667c7ce54ece7be6686287cb73d54a0ae614ebb3acf67f2e8b6b1590101a6b471eb3a1489b9ef96a29d1d25f238502f18d3b0614
+  data.tar.gz: 3962921d6e1500c0b825e49bee236ae3cfd0edb828ca8b480694265116fdccfcc8871ce53ceb22021b5ee3018ac6b8ce3d050a808161d6ea587b37131deb1c90

data/README.adoc ADDED

@@ -0,0 +1,32 @@
+= Asciidoctor CJK Breaks
+Kaizhao Zhang
+Asciidoctor CJK Breaks is an extension for Asciidoctor that suppresses line breaks between east asian characters.
+Normally, newlines in AsciiDoc text are rendered as newlines in the output HTML. Browsers then usually render those newlines as whitespace (smarter behavior is described in W3C drafts, but not actually implemented by vendors).
+This extension finds and removes newlines that should not be converted to a space. The algorithm matches https://www.w3.org/TR/css-text-3/#line-break-transform[CSS Text Module Level 3]:
+- If the character immediately before or immediately after the segment break is the zero-width space character (U+200B), then the break is removed, leaving behind the zero-width space.
+- Otherwise, if the East Asian Width property [UAX11] of both the character before and after the segment break is F, W, or H (not A), and neither side is Hangul, then the segment break is removed.
+- Otherwise, the segment break is converted to a space (U+0020).
+---
+This is a Ruby port of https://github.com/markdown-it/markdown-it-cjk-breaks[markdown-it-cjk-breaks].
+Status: https://github.com/markdown-it/markdown-it-cjk-breaks/commit/15c7a4144e0e6f94fada671a6eb2c7b63e2358f0[15c7a41]
+== Install
+ $ gem install asciidoctor_cjk_breaks
+== Usage
+ $ asciidoctor -r asciidoctor_cjk_breaks example.adoc
+== Test
+ $ ruby test/test_cjk_breaks.rb
+And then open the converted HTML files in `test/fixtures` directory.

data/lib/asciidoctor/cjk_breaks_treeprocessor.rb ADDED

@@ -0,0 +1,85 @@
+require 'asciidoctor/extensions'
+require 'east_asian_width'
+class CjkBreaksTreeprocessor < Asciidoctor::Extensions::Treeprocessor
+  def process(document)
+    return unless document.blocks?
+    remove_cjk_breaks document
+    nil
+  end
+  def remove_cjk_breaks(node)
+    node.blocks.each_with_index do |block, index|
+      if block.context == :paragraph
+        content_changed = false
+        # Processing after raw_source -> block.lines -> block.content in asciidoctor.
+        # It may be better to make this process while the process from raw_source -> block.lines
+        # whose code flow is:
+        # -> `Asciidoctor::Block.initialize`
+        # -> `Asciidoctor::Helpers.normalize_lines_from_string`.
+        lines = block.content.lines
+        lines.each_with_index do |line, line_index|
+          last_char_idx = line.rindex(/[^\r|\n]/)
+          last_char = line[last_char_idx]
+          next_line = lines[line_index + 1]
+          next_char = next_line[0] if next_line
+          next unless last_char && next_char
+          remove_break = false
+          if last_char == "\u200b" || next_char == "\u200b"
+            # remove newline if it's adjacent to ZWSP
+            remove_break = true
+          elsif EastAsianWidth.east_asian_width(last_char).match?(/^[FWH]$/) &&
+                EastAsianWidth.east_asian_width(next_char).match?(/^[FWH]$/)
+            # remove newline if both characters are fullwidth (F), wide (W) or
+            # halfwidth (H), but not Hangul
+            remove_break = true if !hangul?(last_char) && !hangul?(next_char)
+          end
+          if remove_break
+            lines[line_index] = line.chomp
+            content_changed = true
+          end
+        end
+        if content_changed
+          node.blocks[index] = create_paragraph block.document, lines.join(''), block.attributes
+        end
+      else
+        remove_cjk_breaks block
+      end
+    end
+  end
+  REGEXP_HANGUL_CHARS = /
+    [
+    \u1100-\u11FF
+    \u302E
+    \u302F
+    \u3131-\u318E
+    \u3200-\u321E
+    \u3260-\u327E
+    \uA960-\uA97C
+    \uAC00-\uD7A3
+    \uD7B0-\uD7C6
+    \uD7CB-\uD7FB
+    \uFFA0-\uFFBE
+    \uFFC2-\uFFC7
+    \uFFCA-\uFFCF
+    \uFFD2-\uFFD7
+    \uFFDA-\uFFDC
+    ]
+  /x
+  def hangul?(char)
+    # require('unicode-10.0.0/Script/Hangul/regex')
+    char.match?(REGEXP_HANGUL_CHARS)
+  end
+end
+# Hand the tree processor to Asciidoctor's extension registry when this file
+# is loaded.
+Asciidoctor::Extensions.register { treeprocessor CjkBreaksTreeprocessor }

data/lib/asciidoctor_cjk_breaks.rb ADDED

	@@ -0,0 +1 @@
1	+ require_relative 'asciidoctor/cjk_breaks_treeprocessor'

data/test/fixtures/cjk_breaks.txt ADDED

@@ -0,0 +1,72 @@
+Remove linebreaks near ZWSP
+.
+foo
+bar
+baz
+.
+<p>foobarbaz</p>
+.
+Remove linebreaks between hiragana (wide) characters
+.
+あおえ
+いう
+.
+<p>あおえいう</p>
+.
+Remove linebreaks between halfwidth katakana
+.
+ｱｵｴ
+ｲｳ
+.
+<p>ｱｵｴｲｳ</p>
+.
+Remove linebreaks between fullwidth characters
+.
+！＂＃
+＄％
+.
+<p>！＂＃＄％</p>
+.
+Keep linebreaks between hangul characters
+.
+ㅏㅗㅔ
+ㅣㅜ
+ￂￌￇ
+ￜￓ
+.
+<p>ㅏㅗㅔ
+ㅣㅜ
+ￂￌￇ
+ￜￓ</p>
+.
+Keep linebreaks between hiragana (wide) and english
+.
+あおえ
+aoe
+あおえ
+.
+<p>あおえ
+aoe
+あおえ</p>
+.
+Emphasis tokens should be skipped
+.
+*あおえ*
+*いう*
+.
+<p><em>あおえ</em><em>いう</em></p>
+.
+Should recognize astral characters correctly
+.
+foo🈀
+🈀foo
+.
+<p>foo🈀🈀foo</p>
+.

data/test/fixtures/tengwanggexu-wangbo.txt ADDED

@@ -0,0 +1,85 @@
+滕王阁序 --- 王勃
+豫章故郡，洪都新府。
+星分翼轸，地接衡庐。
+襟三江而带五湖，控蛮荆而引瓯越。
+物华天宝，龙光射牛斗之墟；
+人杰地灵，徐孺下陈蕃之榻。
+雄州雾列，俊采星驰。
+台隍枕夷夏之交，宾主尽东南之美。
+都督阎公之雅望，棨戟遥临；
+宇文新州之懿范，襜帷暂驻。
+十旬休假，胜友如云；
+千里逢迎，高朋满座。
+腾蛟起凤，孟学士之词宗；
+紫电青霜，王将军之武库。
+家君作宰，路出名区；
+童子何知，躬逢胜饯。
+时维九月，序属三秋。
+潦水尽而寒潭清，烟光凝而暮山紫。
+俨骖騑于上路，访风景于崇阿。
+临帝子之长洲，得仙人之旧馆。
+层峦耸翠，上出重霄；
+飞阁流丹，下临无地。
+鹤汀凫渚，穷岛屿之萦回；
+桂殿兰宫，即冈峦之体势。
+披绣闼，俯雕甍，山原旷其盈视，川泽纡其骇瞩。
+闾阎扑地，钟鸣鼎食之家；
+舸舰迷津，青雀黄龙之舳。
+云销雨霁，彩彻区明。
+落霞与孤鹜齐飞，秋水共长天一色。
+渔舟唱晚，响穷彭蠡之滨，雁阵惊寒，声断衡阳之浦。
+遥襟甫畅，逸兴遄飞。
+爽籁发而清风生，纤歌凝而白云遏。
+睢园绿竹，气凌彭泽之樽；
+邺水朱华，光照临川之笔。
+四美具，二难并。
+穷睇眄于中天，极娱游于暇日。
+天高地迥，觉宇宙之无穷；
+兴尽悲来，识盈虚之有数。
+望长安于日下，目吴会于云间。
+地势极而南溟深，天柱高而北辰远。
+关山难越，谁悲失路之人；
+萍水相逢，尽是他乡之客。
+怀帝阍而不见，奉宣室以何年？
+嗟乎！
+时运不齐，命途多舛。
+冯唐易老，李广难封。
+屈贾谊于长沙，非无圣主；
+窜梁鸿于海曲，岂乏明时？
+所赖君子见机，达人知命。
+老当益壮，宁移白首之心？
+穷且益坚，不坠青云之志。
+酌贪泉而觉爽，处涸辙以犹欢。
+北海虽赊，扶摇可接；
+东隅已逝，桑榆非晚。
+孟尝高洁，空余报国之情；
+阮籍猖狂，岂效穷途之哭！
+勃，三尺微命，一介书生。
+无路请缨，等终军之弱冠；
+有怀投笔，慕宗悫之长风。
+舍簪笏于百龄，奉晨昏于万里。
+非谢家之宝树，接孟氏之芳邻。
+他日趋庭，叨陪鲤对；
+今兹捧袂，喜托龙门。
+杨意不逢，抚凌云而自惜；
+钟期既遇，奏流水以何惭？
+呜乎！
+胜地不常，盛筵难再；
+兰亭已矣，梓泽丘墟。
+临别赠言，幸承恩于伟饯；
+登高作赋，是所望于群公。
+敢竭鄙怀，恭疏短引；
+一言均赋，四韵俱成。
+请洒潘江，各倾陆海云尔：
+ 滕王高阁临江渚，佩玉鸣鸾罢歌舞。
+ 画栋朝飞南浦云，珠帘暮卷西山雨。
+ 闲云潭影日悠悠，物换星移几度秋。
+ 阁中帝子今何在？槛外长江空自流。

data/test/test_cjk_breaks.rb ADDED

@@ -0,0 +1,4 @@
+require_relative '../lib/asciidoctor_cjk_breaks'
+Asciidoctor.convert_file 'test/fixtures/cjk_breaks.txt'
+Asciidoctor.convert_file 'test/fixtures/tengwanggexu-wangbo.txt'

metadata ADDED

@@ -0,0 +1,78 @@
+--- !ruby/object:Gem::Specification
+name: asciidoctor_cjk_breaks
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Kaizhao Zhang
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2018-11-09 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: asciidoctor
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+- !ruby/object:Gem::Dependency
+  name: east_asian_width
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+description: An extension for Asciidoctor that suppresses line breaks between east
+  asian characters.
+email: zhangkaizhao@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- README.adoc
+- lib/asciidoctor/cjk_breaks_treeprocessor.rb
+- lib/asciidoctor_cjk_breaks.rb
+- test/fixtures/cjk_breaks.txt
+- test/fixtures/tengwanggexu-wangbo.txt
+- test/test_cjk_breaks.rb
+homepage: https://github.com/zhangkaizhao/asciidoctor_cjk_breaks
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.7.6
+signing_key:
+specification_version: 4
+summary: Suppress line breaks between east asian characters
+test_files: []