RubyGems - text_clean - Versions diffs - 0.2.0 → 0.2.1 - Mend

text_clean 0.2.0 → 0.2.1

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml +4 -4
data/ext/text_clean/text_clean.cc +21 -9
data/lib/text_clean/version.rb +1 -1
data/spec/text_clean_spec.rb +31 -13
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2e529b66807a01385f10fc896e173880ebe10925
-  data.tar.gz: d4c095af593c650788cc54795ca12a76c695a652
+  metadata.gz: bb04963f984de57939786cb051eb1096f04fa416
+  data.tar.gz: 1f811f6b548b3e267f962d327b63bb9df8aef61d
 SHA512:
-  metadata.gz: 8c45e5724b4f8e5598fdeee84869564b737e9ac77ff30c08e756fec9adec5f148f150cf5026dacd0dd16e011357ff02c02d4ceeb61b0a3725400b7e23a98b786
-  data.tar.gz: 15ff835dff5b83c95e71d87e4d483ac54bf9978c8c66d38e67d738b577c0a24a140c5e38528fd5fd0c6af8f60a5c5ddb057b40c43012cc0bf845d37fcd6a1c7d
+  metadata.gz: 7f9e02f59b7900ea3ff595fd0f07a344a70913a9a1f0ee484fe46b02e4459ee1a749fc554c220fe0573563467f96aaa9a2fb7c32fee64bf2b3e106be8cd0e3da
+  data.tar.gz: 77f6299a203f3129269ca29fa817dcbb935eaa2c1a55229eab24927e9ad10dad6b4533f5c3f1222ea2a878dd52fe382a59f07e8b0e82720808fce0f2071da9a5

data/ext/text_clean/text_clean.cc CHANGED Viewed

@@ -27,13 +27,13 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
   char* read;
   char* write = text;
   uint8_t just_added_space = true,   // prevent prefix spaces
-          just_added_period = false;
+          just_added_line_sep = false;
   for (read = text; read < eos; read++) {
     char c = *read;
     if (c >= 'A' && c <= 'Z') {
       // Change upper case to lowercase
       c += 32;
-    } else if (c == '\t' || c == ',' || c == '&' || c == '/') {
+    } else if (c == '\t' || c == '_' || c == ',' || c == '&' || c == '/') {
       // Change inconsequential punctuation to spaces (i.e. all count as whitespace)
       c = ' ';
     } else if (c == '?' || c == '!' || c == ':' || c == ';') {
@@ -45,11 +45,11 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
     if (c == '-') {
       // double dash?
       if (*(read + 1) == '-') {
-        if (!just_added_space) {
+        if (!just_added_space && !just_added_line_sep) {
           *write++ = ' ';
           read++;
           just_added_space = true;
-          just_added_period = false;
+          just_added_line_sep = false;
         }
       } else {
         // scan ahead to see if this hyphen is at the end of the line
@@ -68,20 +68,32 @@ size_t text_clean_cstr(char* text, long len, char line_sep)
           }
         }
       }
-    } else if (c == '.' && !just_added_period) {
+    } else if (c == '.' && !just_added_line_sep) {
+      // look-behind and see if this is an abbreviation
+      if (write - text >= 2) {
+        char a = *(write - 2);
+        char b = *(write - 1);
+        // we're just checking for single-letter abbrevs, so see if 2-chars-behind is whitespace
+        if (a == ' ' || a == '.' || a == '\n' || a == '\t') {
+          *write++ = '.';
+          continue;
+        }
+      }
       // erase space before period
       if (just_added_space) write--;
       *write++ = line_sep;
-      just_added_period = true;
+      just_added_line_sep = true;
       just_added_space = false;
-    } else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_period) {
+    } else if ((c == ' ' || c == '\n') && !just_added_space && !just_added_line_sep) {
       *write++ = ' ';
       just_added_space = true;
-      just_added_period = false;
+      just_added_line_sep = false;
     } else if (c == '\'' || (c >= 'a' && c <= 'z')) {
       *write++ = c;
       just_added_space = false;
-      just_added_period = false;
+      just_added_line_sep = false;
     }
   }
   // erase space at end of text

data/lib/text_clean/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module TextClean
-  VERSION = '0.2.0'
+  VERSION = '0.2.1'
 end

data/spec/text_clean_spec.rb CHANGED Viewed

@@ -19,12 +19,34 @@ describe TextClean do
     clean_eq("is\t it    so?", "is it so\n")
   end
+  context "separates words" do
+    it "with double hyphen" do
+      clean_eq("good--do it", "good do it")
+    end
+    it "with newlines" do
+      clean_eq("a\nb\nc", "a b c")
+    end
+    it "with underscores" do
+      clean_eq("short_title", "short title")
+    end
+  end
+  it "compacts whitespace around double hyphen" do
+    clean_eq("good -- do it", "good do it")
+  end
+  it "does not add whitespace when double hypen occurs after end of sentence" do
+    clean_eq("ok.--maybe", "ok\nmaybe")
+  end
   it "treats [,/&] as whitespace" do
     clean_eq("a,bb&cc/d", "a bb cc d")
   end
   it "treats [;:!?] as sentence separators" do
-    clean_eq("x;y?z!:q", "x.y.z.q", ".")
+    clean_eq("x;y?z!:q", "x|y|z|q", "|")
   end
   it "joins hyphenated words at line end" do
@@ -35,18 +57,6 @@ describe TextClean do
     clean_eq("satis- \t \nfaction", "satisfaction")
   end
-  it "treats a double hyphen as word separator" do
-    clean_eq("good--do it", "good do it")
-  end
-  it "compacts whitespace around double hyphen" do
-    clean_eq("good -- do it", "good do it")
-  end
-  it "treats newlines as word separator" do
-    clean_eq("a\nb\nc", "a b c")
-  end
   it "ignores numbers" do
     clean_eq("123abc", "abc")
   end
@@ -58,4 +68,12 @@ describe TextClean do
   it "keeps apostrophes" do
     clean_eq("dad's", "dad's")
   end
+  it "keeps abbreviated single letters" do
+    clean_eq("a mr t. sawyer", "a mr t. sawyer")
+  end
+  it "keeps abbreviated single letters that follow other abbreviations without spaces" do
+    clean_eq("a mr t.j. sawyer", "a mr t.j. sawyer")
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: text_clean
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Duane Johnson