text_clean 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2e529b66807a01385f10fc896e173880ebe10925
4
- data.tar.gz: d4c095af593c650788cc54795ca12a76c695a652
3
+ metadata.gz: bb04963f984de57939786cb051eb1096f04fa416
4
+ data.tar.gz: 1f811f6b548b3e267f962d327b63bb9df8aef61d
5
5
  SHA512:
6
- metadata.gz: 8c45e5724b4f8e5598fdeee84869564b737e9ac77ff30c08e756fec9adec5f148f150cf5026dacd0dd16e011357ff02c02d4ceeb61b0a3725400b7e23a98b786
7
- data.tar.gz: 15ff835dff5b83c95e71d87e4d483ac54bf9978c8c66d38e67d738b577c0a24a140c5e38528fd5fd0c6af8f60a5c5ddb057b40c43012cc0bf845d37fcd6a1c7d
6
+ metadata.gz: 7f9e02f59b7900ea3ff595fd0f07a344a70913a9a1f0ee484fe46b02e4459ee1a749fc554c220fe0573563467f96aaa9a2fb7c32fee64bf2b3e106be8cd0e3da
7
+ data.tar.gz: 77f6299a203f3129269ca29fa817dcbb935eaa2c1a55229eab24927e9ad10dad6b4533f5c3f1222ea2a878dd52fe382a59f07e8b0e82720808fce0f2071da9a5
@@ -27,13 +27,13 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
27
27
  char* read;
28
28
  char* write = text;
29
29
  uint8_t just_added_space = true, // prevent prefix spaces
30
- just_added_period = false;
30
+ just_added_line_sep = false;
31
31
  for (read = text; read < eos; read++) {
32
32
  char c = *read;
33
33
  if (c >= 'A' && c <= 'Z') {
34
34
  // Change upper case to lowercase
35
35
  c += 32;
36
- } else if (c == '\t' || c == ',' || c == '&' || c == '/') {
36
+ } else if (c == '\t' || c == '_' || c == ',' || c == '&' || c == '/') {
37
37
  // Change inconsequential punctuation to spaces (i.e. all count as whitespace)
38
38
  c = ' ';
39
39
  } else if (c == '?' || c == '!' || c == ':' || c == ';') {
@@ -45,11 +45,11 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
45
45
  if (c == '-') {
46
46
  // double dash?
47
47
  if (*(read + 1) == '-') {
48
- if (!just_added_space) {
48
+ if (!just_added_space && !just_added_line_sep) {
49
49
  *write++ = ' ';
50
50
  read++;
51
51
  just_added_space = true;
52
- just_added_period = false;
52
+ just_added_line_sep = false;
53
53
  }
54
54
  } else {
55
55
  // scan ahead to see if this hyphen is at the end of the line
@@ -68,20 +68,32 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
68
68
  }
69
69
  }
70
70
  }
71
- } else if (c == '.' && !just_added_period) {
71
+ } else if (c == '.' && !just_added_line_sep) {
72
+ // look-behind and see if this is an abbreviation
73
+ if (write - text >= 2) {
74
+ char a = *(write - 2);
75
+ char b = *(write - 1);
76
+
77
+ // we're just checking for single-letter abbrevs, so see if 2-chars-behind is whitespace
78
+ if (a == ' ' || a == '.' || a == '\n' || a == '\t') {
79
+ *write++ = '.';
80
+ continue;
81
+ }
82
+ }
83
+
72
84
  // erase space before period
73
85
  if (just_added_space) write--;
74
86
  *write++ = line_sep;
75
- just_added_period = true;
87
+ just_added_line_sep = true;
76
88
  just_added_space = false;
77
- } else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_period) {
89
+ } else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_line_sep) {
78
90
  *write++ = ' ';
79
91
  just_added_space = true;
80
- just_added_period = false;
92
+ just_added_line_sep = false;
81
93
  } else if (c == '\'' || (c >= 'a' && c <= 'z')) {
82
94
  *write++ = c;
83
95
  just_added_space = false;
84
- just_added_period = false;
96
+ just_added_line_sep = false;
85
97
  }
86
98
  }
87
99
  // erase space at end of text
@@ -1,3 +1,3 @@
1
1
  module TextClean
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end
@@ -19,12 +19,34 @@ describe TextClean do
19
19
  clean_eq("is\t it so?", "is it so\n")
20
20
  end
21
21
 
22
+ context "separates words" do
23
+ it "with double hyphen" do
24
+ clean_eq("good--do it", "good do it")
25
+ end
26
+
27
+ it "with newlines" do
28
+ clean_eq("a\nb\nc", "a b c")
29
+ end
30
+
31
+ it "with underscores" do
32
+ clean_eq("short_title", "short title")
33
+ end
34
+ end
35
+
36
+ it "compacts whitespace around double hyphen" do
37
+ clean_eq("good -- do it", "good do it")
38
+ end
39
+
40
+ it "does not add whitespace when double hyphen occurs after end of sentence" do
41
+ clean_eq("ok.--maybe", "ok\nmaybe")
42
+ end
43
+
22
44
  it "treats [,/&] as whitespace" do
23
45
  clean_eq("a,bb&cc/d", "a bb cc d")
24
46
  end
25
47
 
26
48
  it "treats [;:!?] as sentence separators" do
27
- clean_eq("x;y?z!:q", "x.y.z.q", ".")
49
+ clean_eq("x;y?z!:q", "x|y|z|q", "|")
28
50
  end
29
51
 
30
52
  it "joins hyphenated words at line end" do
@@ -35,18 +57,6 @@ describe TextClean do
35
57
  clean_eq("satis- \t \nfaction", "satisfaction")
36
58
  end
37
59
 
38
- it "treats a double hyphen as word separator" do
39
- clean_eq("good--do it", "good do it")
40
- end
41
-
42
- it "compacts whitespace around double hyphen" do
43
- clean_eq("good -- do it", "good do it")
44
- end
45
-
46
- it "treats newlines as word separator" do
47
- clean_eq("a\nb\nc", "a b c")
48
- end
49
-
50
60
  it "ignores numbers" do
51
61
  clean_eq("123abc", "abc")
52
62
  end
@@ -58,4 +68,12 @@ describe TextClean do
58
68
  it "keeps apostrophes" do
59
69
  clean_eq("dad's", "dad's")
60
70
  end
71
+
72
+ it "keeps abbreviated single letters" do
73
+ clean_eq("a mr t. sawyer", "a mr t. sawyer")
74
+ end
75
+
76
+ it "keeps abbreviated single letters that follow other abbreviations without spaces" do
77
+ clean_eq("a mr t.j. sawyer", "a mr t.j. sawyer")
78
+ end
61
79
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_clean
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duane Johnson