cjk_auto_space 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +8 -0
- data/Rakefile +13 -0
- data/ext/cjk_auto_space/cjk_auto_space.c +177 -0
- data/ext/cjk_auto_space/extconf.rb +10 -0
- data/lib/cjk_auto_space.rb +12 -0
- metadata +52 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 46f97e06e96ab1a04cb91ef23adac2300581af29d99b40485e98dd5059d6ba9b
|
4
|
+
data.tar.gz: 9a0d0d19b838185be702124b44c2f3a3c4a26a75036daf5195e3a6e96182d3ff
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f60f7c271f76cffa17ddfa7b213303ed5de88b699e891978372c27d181ef610f68443d2352692c8bb78a22826e00a3a9f6903261bf9c941094cdc05af7b79ad2
|
7
|
+
data.tar.gz: 3004cea02af4e227af18a0c7be436e844f638e49f80f6f156b2c7d799604c4c6f105540e69f99a3dceab01311cea69643d3c2f8ef48f38627f4c2aa79f882ec1
|
data/README.md
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
/*
|
2
|
+
gcc ./cjkpad.c -o cjkpad `pkg-config --libs --cflags icu-uc icu-io`
|
3
|
+
*/
|
4
|
+
#include <stdio.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <string.h>
|
7
|
+
#include <unicode/utext.h>
|
8
|
+
/*#include <unicode/ustdio.h>*/
|
9
|
+
#include "ruby.h"
|
10
|
+
#include "extconf.h"
|
11
|
+
|
12
|
+
static UErrorCode err = U_ZERO_ERROR;
|
13
|
+
|
14
|
+
UBool u_iscjk(int ublock);
|
15
|
+
|
16
|
+
typedef struct gra {
|
17
|
+
UChar32 cp; // code point
|
18
|
+
UBlockCode block; // unicode block
|
19
|
+
size_t start; // starting pos at the str[]
|
20
|
+
size_t length; // number of char needed (cjk is mostly 3 char)
|
21
|
+
bool cjk;
|
22
|
+
} gra;
|
23
|
+
|
24
|
+
UBool u_isMarkdown(gra* g, bool prefix);
|
25
|
+
|
26
|
+
char* padCjk(const char* str) {
|
27
|
+
char* formatted = calloc(1024, sizeof(char));
|
28
|
+
|
29
|
+
gra* graphemes = malloc(1024 * sizeof(gra));
|
30
|
+
int graLen; // number of graphemes
|
31
|
+
|
32
|
+
UText *ut = utext_openUTF8(NULL, str, -1, &err);
|
33
|
+
|
34
|
+
int begin = 0;
|
35
|
+
int end;
|
36
|
+
int i = 0;
|
37
|
+
int blk;
|
38
|
+
int filled = 0;
|
39
|
+
|
40
|
+
for (UChar32 cp = utext_next32From(ut, 0);
|
41
|
+
cp > -1;
|
42
|
+
cp = utext_next32(ut), i++) {
|
43
|
+
end = utext_getNativeIndex(ut);
|
44
|
+
graphemes[i].cp = cp;
|
45
|
+
blk = ublock_getCode(cp);
|
46
|
+
graphemes[i].block = blk;
|
47
|
+
graphemes[i].start = begin;
|
48
|
+
graphemes[i].length = end - begin;
|
49
|
+
graphemes[i].cjk = u_iscjk(blk);
|
50
|
+
|
51
|
+
begin = end;
|
52
|
+
}
|
53
|
+
|
54
|
+
graLen = i;
|
55
|
+
|
56
|
+
filled += graphemes[0].length;
|
57
|
+
strncat(formatted, &str[graphemes[0].start], graphemes[0].length);
|
58
|
+
|
59
|
+
for (int j = 1; j <= graLen; j++) {
|
60
|
+
gra *prev = &graphemes[j-1];
|
61
|
+
gra *curr = &graphemes[j];
|
62
|
+
gra *next = &graphemes[j+1];
|
63
|
+
|
64
|
+
//u_printf("%C%C%C\n", prev->cp, curr->cp, next->cp);
|
65
|
+
|
66
|
+
if ((u_isMarkdown(curr, true) && !prev->cjk && !u_isspace(prev->cp) && next->cjk) || // d*哈
|
67
|
+
(!prev->cjk && !u_isspace(prev->cp) && !u_isMarkdown(prev, true) && curr->cjk) || // *哈
|
68
|
+
(!curr->cjk && !u_isspace(curr->cp) && !u_isMarkdown(curr, false) && prev->cjk)) { // 哈*
|
69
|
+
filled++;
|
70
|
+
if (filled == strlen(formatted)-1) {
|
71
|
+
formatted = realloc(formatted, strlen(formatted) * 2 * sizeof(char));
|
72
|
+
}
|
73
|
+
strcat(formatted, " ");
|
74
|
+
}
|
75
|
+
filled += curr->length;
|
76
|
+
if (filled == strlen(formatted)-1) {
|
77
|
+
formatted = realloc(formatted, strlen(formatted) * 2 * sizeof(char));
|
78
|
+
}
|
79
|
+
strncat(formatted, &str[curr->start], curr->length);
|
80
|
+
|
81
|
+
if (u_isMarkdown(curr, false) && prev->cjk && !u_isspace(next->cp) && !next->cjk) { // 哈*n
|
82
|
+
filled++;
|
83
|
+
if (filled == strlen(formatted)-1) {
|
84
|
+
formatted = realloc(formatted, strlen(formatted) * 2 * sizeof(char));
|
85
|
+
}
|
86
|
+
strcat(formatted, " ");
|
87
|
+
}
|
88
|
+
}
|
89
|
+
|
90
|
+
//printf("\n%s\n", formatted);
|
91
|
+
free(graphemes);
|
92
|
+
|
93
|
+
return formatted;
|
94
|
+
}
|
95
|
+
|
96
|
+
/*
 * Tell whether a code point is a Markdown delimiter for spacing purposes.
 *
 * g:      the code point to classify (only g->cp is read).
 * prefix: true when asking "can this open a span in front of text",
 *         false when asking "can this close a span behind text".
 *
 * '_', '*' and '`' count in both positions. '>' and '[' count only as a
 * prefix, ']' only as a suffix. Returns 1 for a delimiter, 0 otherwise.
 */
UBool u_isMarkdown(gra* g, bool prefix) {
    switch (g->cp) {
    case 0x005F: // _
    case 0x002A: // *
    case 0x0060: // `
        return 1;
    case 0x003E: // >
    case 0x005B: // [
        /* Opening-only delimiters. These must not fall through into the
         * ']' case, which would report them as closing delimiters too. */
        return prefix ? 1 : 0;
    case 0x005D: // ]
        return prefix ? 0 : 1;
    default:
        return 0;
    }
}
|
122
|
+
|
123
|
+
UBool u_iscjk(int ublock) {
|
124
|
+
UBool cjk;
|
125
|
+
|
126
|
+
switch (ublock) {
|
127
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
|
128
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
|
129
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
|
130
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
|
131
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
|
132
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E:
|
133
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F:
|
134
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G:
|
135
|
+
case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H:
|
136
|
+
case UBLOCK_CJK_RADICALS_SUPPLEMENT:
|
137
|
+
case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
|
138
|
+
case UBLOCK_HIRAGANA:
|
139
|
+
case UBLOCK_KATAKANA:
|
140
|
+
case UBLOCK_BOPOMOFO:
|
141
|
+
case UBLOCK_BOPOMOFO_EXTENDED:
|
142
|
+
case UBLOCK_KANBUN:
|
143
|
+
case UBLOCK_CJK_STROKES:
|
144
|
+
case UBLOCK_KATAKANA_PHONETIC_EXTENSIONS:
|
145
|
+
case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS:
|
146
|
+
case UBLOCK_CJK_COMPATIBILITY:
|
147
|
+
case UBLOCK_HANGUL_JAMO:
|
148
|
+
case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
|
149
|
+
case UBLOCK_HANGUL_JAMO_EXTENDED_A:
|
150
|
+
case UBLOCK_HANGUL_SYLLABLES:
|
151
|
+
case UBLOCK_HANGUL_JAMO_EXTENDED_B:
|
152
|
+
case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS:
|
153
|
+
case UBLOCK_CJK_COMPATIBILITY_FORMS:
|
154
|
+
case UBLOCK_KANA_EXTENDED_B:
|
155
|
+
case UBLOCK_KANA_SUPPLEMENT:
|
156
|
+
case UBLOCK_KANA_EXTENDED_A:
|
157
|
+
case UBLOCK_SMALL_KANA_EXTENSION:
|
158
|
+
cjk = 1;
|
159
|
+
break;
|
160
|
+
default:
|
161
|
+
cjk = 0;
|
162
|
+
break;
|
163
|
+
};
|
164
|
+
return cjk;
|
165
|
+
}
|
166
|
+
|
167
|
+
/*
 * String#cjk_auto_space: return a new String with spaces inserted between
 * CJK and non-CJK text. The receiver is not modified.
 *
 * Raises ArgumentError (from StringValueCStr) if the receiver contains a
 * NUL byte, and RuntimeError if padCjk() cannot produce a result.
 */
static VALUE rb_pad_cjk(VALUE self) {
    Check_Type(self, T_STRING);

    char* in = StringValueCStr(self);
    char* out = padCjk(in);
    if (out == NULL) {
        rb_raise(rb_eRuntimeError, "cjk_auto_space: unable to pad string");
    }

    /* padCjk() works on UTF-8, so tag the result as UTF-8; a plain
     * rb_str_new_cstr() would hand back an ASCII-8BIT string. */
    VALUE result = rb_utf8_str_new_cstr(out);

    /* The Ruby string owns its own copy; release the C buffer. */
    free(out);
    return result;
}
|
174
|
+
|
175
|
+
void Init_cjk_auto_space(void) {
|
176
|
+
rb_define_method(rb_cString, "cjk_auto_space", rb_pad_cjk, 0);
|
177
|
+
}
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'mkmf'

# Register "icu" as a configurable target so a non-standard ICU install can
# be selected with --with-icu-dir, --with-icu-include or --with-icu-lib.
# dir_config's first argument is a target name, not a path; the system
# default include and library directories are searched without being listed.
dir_config("icu")

abort("missing \"unicode/utext.h\"") unless find_header("unicode/utext.h")

# Best effort only: the result is ignored on purpose, the have_library
# check below (which compiles against the ICU header) is the one that decides.
find_library("icuuc", "utext_openUTF8", "/usr/lib64", "/usr/lib")
abort("missing \"utext_openUTF8\"") unless have_library("icuuc", "utext_openUTF8", "unicode/utext.h")

create_header
create_makefile 'cjk_auto_space/cjk_auto_space'
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# Load the native extension. A build stored under a directory named after
# the running Ruby's MAJOR.MINOR version is tried first; if that file is
# not there, fall back to the extension found on the load path.
begin
  require_relative "cjk_auto_space/#{::RUBY_VERSION[/\d+\.\d+/]}/cjk_auto_space"
rescue LoadError
  require "cjk_auto_space/cjk_auto_space"
end
|
7
|
+
|
8
|
+
String.class_eval do
  # In-place variant of String#cjk_auto_space: overwrites the receiver with
  # its padded form and returns the receiver.
  #
  # String#replace is used rather than sub!, because sub! would interpret
  # backslash sequences (\0, \1, \\) inside the replacement text.
  def cjk_auto_space!
    replace(cjk_auto_space)
  end
end
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cjk_auto_space
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Marguerite Su
|
8
|
+
- Shenlebantongying
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: A rubygem that automatically adds a space (U+0020) between CJK and other
|
15
|
+
unicode code points, for better reading experience, with Markdown syntax concerned.
|
16
|
+
email:
|
17
|
+
- marguerite@opensuse.org
|
18
|
+
- shenlebantongying@gmail.com
|
19
|
+
executables: []
|
20
|
+
extensions:
|
21
|
+
- ext/cjk_auto_space/extconf.rb
|
22
|
+
extra_rdoc_files: []
|
23
|
+
files:
|
24
|
+
- README.md
|
25
|
+
- Rakefile
|
26
|
+
- ext/cjk_auto_space/cjk_auto_space.c
|
27
|
+
- ext/cjk_auto_space/extconf.rb
|
28
|
+
- lib/cjk_auto_space.rb
|
29
|
+
homepage: https://github.com/openSUSE-zh/rubygem-cjk_auto_space
|
30
|
+
licenses:
|
31
|
+
- MIT
|
32
|
+
metadata: {}
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
require_paths:
|
36
|
+
- lib
|
37
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
requirements: []
|
48
|
+
rubygems_version: 3.4.1
|
49
|
+
signing_key:
|
50
|
+
specification_version: 4
|
51
|
+
summary: Automatically add spacing between CJK and others with MarkDown syntax concerned
|
52
|
+
test_files: []
|