text_clean 0.2.0 → 0.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2e529b66807a01385f10fc896e173880ebe10925
4
- data.tar.gz: d4c095af593c650788cc54795ca12a76c695a652
3
+ metadata.gz: bb04963f984de57939786cb051eb1096f04fa416
4
+ data.tar.gz: 1f811f6b548b3e267f962d327b63bb9df8aef61d
5
5
  SHA512:
6
- metadata.gz: 8c45e5724b4f8e5598fdeee84869564b737e9ac77ff30c08e756fec9adec5f148f150cf5026dacd0dd16e011357ff02c02d4ceeb61b0a3725400b7e23a98b786
7
- data.tar.gz: 15ff835dff5b83c95e71d87e4d483ac54bf9978c8c66d38e67d738b577c0a24a140c5e38528fd5fd0c6af8f60a5c5ddb057b40c43012cc0bf845d37fcd6a1c7d
6
+ metadata.gz: 7f9e02f59b7900ea3ff595fd0f07a344a70913a9a1f0ee484fe46b02e4459ee1a749fc554c220fe0573563467f96aaa9a2fb7c32fee64bf2b3e106be8cd0e3da
7
+ data.tar.gz: 77f6299a203f3129269ca29fa817dcbb935eaa2c1a55229eab24927e9ad10dad6b4533f5c3f1222ea2a878dd52fe382a59f07e8b0e82720808fce0f2071da9a5
@@ -27,13 +27,13 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
27
27
  char* read;
28
28
  char* write = text;
29
29
  uint8_t just_added_space = true, // prevent prefix spaces
30
- just_added_period = false;
30
+ just_added_line_sep = false;
31
31
  for (read = text; read < eos; read++) {
32
32
  char c = *read;
33
33
  if (c >= 'A' && c <= 'Z') {
34
34
  // Change upper case to lowercase
35
35
  c += 32;
36
- } else if (c == '\t' || c == ',' || c == '&' || c == '/') {
36
+ } else if (c == '\t' || c == '_' || c == ',' || c == '&' || c == '/') {
37
37
  // Change inconsequential punctuation to spaces (i.e. all count as whitespace)
38
38
  c = ' ';
39
39
  } else if (c == '?' || c == '!' || c == ':' || c == ';') {
@@ -45,11 +45,11 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
45
45
  if (c == '-') {
46
46
  // double dash?
47
47
  if (*(read + 1) == '-') {
48
- if (!just_added_space) {
48
+ if (!just_added_space && !just_added_line_sep) {
49
49
  *write++ = ' ';
50
50
  read++;
51
51
  just_added_space = true;
52
- just_added_period = false;
52
+ just_added_line_sep = false;
53
53
  }
54
54
  } else {
55
55
  // scan ahead to see if this hyphen is at the end of the line
@@ -68,20 +68,32 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
68
68
  }
69
69
  }
70
70
  }
71
- } else if (c == '.' && !just_added_period) {
71
+ } else if (c == '.' && !just_added_line_sep) {
72
+ // look-behind and see if this is an abbreviation
73
+ if (write - text >= 2) {
74
+ char a = *(write - 2);
75
+ char b = *(write - 1);
76
+
77
+ // we're just checking for single-letter abbrevs, so see if 2-chars-behind is whitespace
78
+ if (a == ' ' || a == '.' || a == '\n' || a == '\t') {
79
+ *write++ = '.';
80
+ continue;
81
+ }
82
+ }
83
+
72
84
  // erase space before period
73
85
  if (just_added_space) write--;
74
86
  *write++ = line_sep;
75
- just_added_period = true;
87
+ just_added_line_sep = true;
76
88
  just_added_space = false;
77
- } else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_period) {
89
+ } else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_line_sep) {
78
90
  *write++ = ' ';
79
91
  just_added_space = true;
80
- just_added_period = false;
92
+ just_added_line_sep = false;
81
93
  } else if (c == '\'' || (c >= 'a' && c <= 'z')) {
82
94
  *write++ = c;
83
95
  just_added_space = false;
84
- just_added_period = false;
96
+ just_added_line_sep = false;
85
97
  }
86
98
  }
87
99
  // erase space at end of text
@@ -1,3 +1,3 @@
1
1
  module TextClean
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end
@@ -19,12 +19,34 @@ describe TextClean do
19
19
  clean_eq("is\t it so?", "is it so\n")
20
20
  end
21
21
 
22
+ context "separates words" do
23
+ it "with double hyphen" do
24
+ clean_eq("good--do it", "good do it")
25
+ end
26
+
27
+ it "with newlines" do
28
+ clean_eq("a\nb\nc", "a b c")
29
+ end
30
+
31
+ it "with underscores" do
32
+ clean_eq("short_title", "short title")
33
+ end
34
+ end
35
+
36
+ it "compacts whitespace around double hyphen" do
37
+ clean_eq("good -- do it", "good do it")
38
+ end
39
+
40
+ it "does not add whitespace when double hypen occurs after end of sentence" do
41
+ clean_eq("ok.--maybe", "ok\nmaybe")
42
+ end
43
+
22
44
  it "treats [,/&] as whitespace" do
23
45
  clean_eq("a,bb&cc/d", "a bb cc d")
24
46
  end
25
47
 
26
48
  it "treats [;:!?] as sentence separators" do
27
- clean_eq("x;y?z!:q", "x.y.z.q", ".")
49
+ clean_eq("x;y?z!:q", "x|y|z|q", "|")
28
50
  end
29
51
 
30
52
  it "joins hyphenated words at line end" do
@@ -35,18 +57,6 @@ describe TextClean do
35
57
  clean_eq("satis- \t \nfaction", "satisfaction")
36
58
  end
37
59
 
38
- it "treats a double hyphen as word separator" do
39
- clean_eq("good--do it", "good do it")
40
- end
41
-
42
- it "compacts whitespace around double hyphen" do
43
- clean_eq("good -- do it", "good do it")
44
- end
45
-
46
- it "treats newlines as word separator" do
47
- clean_eq("a\nb\nc", "a b c")
48
- end
49
-
50
60
  it "ignores numbers" do
51
61
  clean_eq("123abc", "abc")
52
62
  end
@@ -58,4 +68,12 @@ describe TextClean do
58
68
  it "keeps apostrophes" do
59
69
  clean_eq("dad's", "dad's")
60
70
  end
71
+
72
+ it "keeps abbreviated single letters" do
73
+ clean_eq("a mr t. sawyer", "a mr t. sawyer")
74
+ end
75
+
76
+ it "keeps abbreviated single letters that follow other abbreviations without spaces" do
77
+ clean_eq("a mr t.j. sawyer", "a mr t.j. sawyer")
78
+ end
61
79
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_clean
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duane Johnson