text_clean 0.2.0 → 0.2.1
This diff shows the changes between two publicly released versions of this package, exactly as they appear in the public registry they were published to. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/ext/text_clean/text_clean.cc +21 -9
- data/lib/text_clean/version.rb +1 -1
- data/spec/text_clean_spec.rb +31 -13
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bb04963f984de57939786cb051eb1096f04fa416
|
4
|
+
data.tar.gz: 1f811f6b548b3e267f962d327b63bb9df8aef61d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7f9e02f59b7900ea3ff595fd0f07a344a70913a9a1f0ee484fe46b02e4459ee1a749fc554c220fe0573563467f96aaa9a2fb7c32fee64bf2b3e106be8cd0e3da
|
7
|
+
data.tar.gz: 77f6299a203f3129269ca29fa817dcbb935eaa2c1a55229eab24927e9ad10dad6b4533f5c3f1222ea2a878dd52fe382a59f07e8b0e82720808fce0f2071da9a5
|
@@ -27,13 +27,13 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
|
|
27
27
|
char* read;
|
28
28
|
char* write = text;
|
29
29
|
uint8_t just_added_space = true, // prevent prefix spaces
|
30
|
-
|
30
|
+
just_added_line_sep = false;
|
31
31
|
for (read = text; read < eos; read++) {
|
32
32
|
char c = *read;
|
33
33
|
if (c >= 'A' && c <= 'Z') {
|
34
34
|
// Change upper case to lowercase
|
35
35
|
c += 32;
|
36
|
-
} else if (c == '\t' || c == ',' || c == '&' || c == '/') {
|
36
|
+
} else if (c == '\t' || c == '_' || c == ',' || c == '&' || c == '/') {
|
37
37
|
// Change inconsequential punctuation to spaces (i.e. all count as whitespace)
|
38
38
|
c = ' ';
|
39
39
|
} else if (c == '?' || c == '!' || c == ':' || c == ';') {
|
@@ -45,11 +45,11 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
|
|
45
45
|
if (c == '-') {
|
46
46
|
// double dash?
|
47
47
|
if (*(read + 1) == '-') {
|
48
|
-
if (!just_added_space) {
|
48
|
+
if (!just_added_space && !just_added_line_sep) {
|
49
49
|
*write++ = ' ';
|
50
50
|
read++;
|
51
51
|
just_added_space = true;
|
52
|
-
|
52
|
+
just_added_line_sep = false;
|
53
53
|
}
|
54
54
|
} else {
|
55
55
|
// scan ahead to see if this hyphen is at the end of the line
|
@@ -68,20 +68,32 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
|
|
68
68
|
}
|
69
69
|
}
|
70
70
|
}
|
71
|
-
} else if (c == '.' && !
|
71
|
+
} else if (c == '.' && !just_added_line_sep) {
|
72
|
+
// look-behind and see if this is an abbreviation
|
73
|
+
if (write - text >= 2) {
|
74
|
+
char a = *(write - 2);
|
75
|
+
char b = *(write - 1);
|
76
|
+
|
77
|
+
// we're just checking for single-letter abbrevs, so see if 2-chars-behind is whitespace
|
78
|
+
if (a == ' ' || a == '.' || a == '\n' || a == '\t') {
|
79
|
+
*write++ = '.';
|
80
|
+
continue;
|
81
|
+
}
|
82
|
+
}
|
83
|
+
|
72
84
|
// erase space before period
|
73
85
|
if (just_added_space) write--;
|
74
86
|
*write++ = line_sep;
|
75
|
-
|
87
|
+
just_added_line_sep = true;
|
76
88
|
just_added_space = false;
|
77
|
-
} else if ((c == ' ' || c == '\n') && !just_added_space && !
|
89
|
+
} else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_line_sep) {
|
78
90
|
*write++ = ' ';
|
79
91
|
just_added_space = true;
|
80
|
-
|
92
|
+
just_added_line_sep = false;
|
81
93
|
} else if (c == '\'' || (c >= 'a' && c <= 'z')) {
|
82
94
|
*write++ = c;
|
83
95
|
just_added_space = false;
|
84
|
-
|
96
|
+
just_added_line_sep = false;
|
85
97
|
}
|
86
98
|
}
|
87
99
|
// erase space at end of text
|
data/lib/text_clean/version.rb
CHANGED
data/spec/text_clean_spec.rb
CHANGED
@@ -19,12 +19,34 @@ describe TextClean do
|
|
19
19
|
clean_eq("is\t it so?", "is it so\n")
|
20
20
|
end
|
21
21
|
|
22
|
+
context "separates words" do
|
23
|
+
it "with double hyphen" do
|
24
|
+
clean_eq("good--do it", "good do it")
|
25
|
+
end
|
26
|
+
|
27
|
+
it "with newlines" do
|
28
|
+
clean_eq("a\nb\nc", "a b c")
|
29
|
+
end
|
30
|
+
|
31
|
+
it "with underscores" do
|
32
|
+
clean_eq("short_title", "short title")
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
it "compacts whitespace around double hyphen" do
|
37
|
+
clean_eq("good -- do it", "good do it")
|
38
|
+
end
|
39
|
+
|
40
|
+
it "does not add whitespace when double hypen occurs after end of sentence" do
|
41
|
+
clean_eq("ok.--maybe", "ok\nmaybe")
|
42
|
+
end
|
43
|
+
|
22
44
|
it "treats [,/&] as whitespace" do
|
23
45
|
clean_eq("a,bb&cc/d", "a bb cc d")
|
24
46
|
end
|
25
47
|
|
26
48
|
it "treats [;:!?] as sentence separators" do
|
27
|
-
clean_eq("x;y?z!:q", "x
|
49
|
+
clean_eq("x;y?z!:q", "x|y|z|q", "|")
|
28
50
|
end
|
29
51
|
|
30
52
|
it "joins hyphenated words at line end" do
|
@@ -35,18 +57,6 @@ describe TextClean do
|
|
35
57
|
clean_eq("satis- \t \nfaction", "satisfaction")
|
36
58
|
end
|
37
59
|
|
38
|
-
it "treats a double hyphen as word separator" do
|
39
|
-
clean_eq("good--do it", "good do it")
|
40
|
-
end
|
41
|
-
|
42
|
-
it "compacts whitespace around double hyphen" do
|
43
|
-
clean_eq("good -- do it", "good do it")
|
44
|
-
end
|
45
|
-
|
46
|
-
it "treats newlines as word separator" do
|
47
|
-
clean_eq("a\nb\nc", "a b c")
|
48
|
-
end
|
49
|
-
|
50
60
|
it "ignores numbers" do
|
51
61
|
clean_eq("123abc", "abc")
|
52
62
|
end
|
@@ -58,4 +68,12 @@ describe TextClean do
|
|
58
68
|
it "keeps apostrophes" do
|
59
69
|
clean_eq("dad's", "dad's")
|
60
70
|
end
|
71
|
+
|
72
|
+
it "keeps abbreviated single letters" do
|
73
|
+
clean_eq("a mr t. sawyer", "a mr t. sawyer")
|
74
|
+
end
|
75
|
+
|
76
|
+
it "keeps abbreviated single letters that follow other abbreviations without spaces" do
|
77
|
+
clean_eq("a mr t.j. sawyer", "a mr t.j. sawyer")
|
78
|
+
end
|
61
79
|
end
|