text_clean 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 631e9068f2dffa8f14377a682868a03732144d00
4
+ data.tar.gz: 9aa6e9bb2d6ede27d7bca149da79e077f9a038e8
5
+ SHA512:
6
+ metadata.gz: e55cad056b49d56c812df735ef6961dad3be08de91c63026cc1129e76cd90e96de0f9c12ca46928530c3846450a5498c214aabbed64991cea3e8a6a891ecc0cc
7
+ data.tar.gz: 139483afa8b4cb5204361e22430305196bca352278f23a80eafc1d711732f02b0fac96ffb589782135b77af8d63bb10c255d20ea0fc1ff78800b8084d7a657d5
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
@@ -0,0 +1,6 @@
1
+ require "rake/extensiontask"
2
+
3
+ Rake::ExtensionTask.new "text_clean" do |ext|
4
+ ext.lib_dir = "lib/text_clean"
5
+ ext.source_pattern = "*.{c,cc}"
6
+ end
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "text_clean"
4
+
5
+ puts TextClean.clean(STDIN.read, "\n")
@@ -0,0 +1,24 @@
1
#!/usr/bin/env bash
#
# Reads raw text on stdin and writes cleaned, lower-cased text to stdout,
# one sentence per line.
#
# bash is required: the sed programs below are written with $'...' (ANSI-C)
# quoting, which a plain POSIX sh does not understand. Without the shebang the
# script would be run by whatever shell the caller falls back to.

# replace newlines with periods to make regexes easier
tr '\n' '.' |
# convert sentence endings to periods to be consistent
tr ':;?!' '....' |
# compact line endings
sed -E $'s/\.+/./g' |
# join lines ending in hyphenation
sed -E $'s/- *\.//g' |
# replace periods with newlines
sed -E $'s/[ \\\t]*\.[ \\\t]*/\\\n/g' |
# compact whitespace-like characters
sed -E $'s/[&\/, \\\t\-]+/ /g' |
# remove non-alpha characters
sed -E $'s/[^a-zA-Z\' ]//g' |
# strip leading whitespace
sed -E $'s/^ *//' |
# convert to lowercase
tr '[:upper:]' '[:lower:]'
@@ -0,0 +1,5 @@
1
+ require "mkmf"
2
+
3
+ dir_config("text_clean")
4
+
5
+ create_makefile("text_clean")
@@ -0,0 +1,116 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include <stdio.h>
4
+ #include <string.h>
5
+ #include <assert.h>
6
+
7
+ static rb_encoding* u8_enc;
8
+ static rb_encoding* bin_enc;
9
+
10
/** Transforms text such as the following:
 *
 *   And behold, I said, "This is no good!"
 *   What shall ye say unto these people, there-
 *   fore?
 *
 * Into a cleaned up single line of text, like the following:
 *
 *   and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Spaces indicate word boundaries, while periods indicate sentence boundaries.
 *
 * The buffer is rewritten in place; the output is never longer than the input.
 *
 * @param text      buffer to clean (modified in place, not NUL-terminated by
 *                  this function)
 * @param len       number of bytes of `text` to process
 * @param line_sep  byte written for every sentence boundary
 * @return          number of bytes of cleaned text now at the start of `text`
 *
 * Rules applied per input byte:
 *  - A-Z is lower-cased, a-z is copied.
 *  - tab, newline, carriage return, ',', '&' and '/' count as a space.
 *  - '?', '!', ':' and ';' count as a period.
 *  - "--" counts as a space.
 *  - a single '-' is dropped; when only spaces/tabs (and an optional CR)
 *    separate it from a newline, that newline is dropped too so the word
 *    fragments on both lines are joined.
 *  - every other byte is dropped.
 * Runs of spaces collapse to one, runs of periods collapse to one, a space
 * directly before a period is removed, and the result never starts with a
 * space or a sentence separator nor ends with a space.
 */
size_t text_clean_cstr(char* text, long len, char line_sep)
{
  if (text == NULL || len <= 0) return 0;

  const char* const eos = text + (size_t)len;
  char* write = text;

  // State of the most recently written byte. Starting in the "space" state
  // suppresses leading spaces. Invariant: just_added_space && write > text
  // implies write[-1] == ' '.
  bool just_added_space = true;
  bool just_added_period = false;

  for (const char* read = text; read < eos; read++) {
    char c = *read;

    if (c >= 'A' && c <= 'Z') {
      // Change upper case to lowercase
      c = (char)(c - 'A' + 'a');
    } else if (c == '\t' || c == '\n' || c == '\r' ||
               c == ',' || c == '&' || c == '/') {
      // Inconsequential punctuation and line breaks all count as whitespace,
      // so words on adjacent lines are not glued together.
      c = ' ';
    } else if (c == '?' || c == '!' || c == ':' || c == ';') {
      // Sentence-ending punctuation becomes a period (sentence boundary)
      c = '.';
    } else if (c == '-') {
      if (read + 1 < eos && read[1] == '-') {
        // Double dash: consume both and treat them as one word separator so
        // the normal space-collapsing rules below apply.
        read++;
        c = ' ';
      } else {
        // Single hyphen: look past trailing blanks for the end of the line.
        const char* scan = read + 1;
        while (scan < eos && (*scan == ' ' || *scan == '\t' || *scan == '\r')) {
          scan++;
        }
        if (scan < eos && *scan == '\n') {
          // Hyphenated line join: skip up to and including the newline.
          read = scan;
        }
        // The hyphen itself is never copied.
        continue;
      }
    }

    if (c == '.') {
      // Collapse repeated boundaries and never start the output with one.
      if (just_added_period || write == text) continue;
      // erase space before period (safe: write > text here)
      if (just_added_space) write--;
      *write++ = line_sep;
      just_added_period = true;
      just_added_space = false;
    } else if (c == ' ') {
      if (just_added_space || just_added_period) continue;
      *write++ = ' ';
      just_added_space = true;
      just_added_period = false;
    } else if (c >= 'a' && c <= 'z') {
      *write++ = c;
      just_added_space = false;
      just_added_period = false;
    }
    // anything else (digits, quotes, non-ASCII bytes, ...) is dropped
  }

  // erase space at end of text; the write > text guard keeps the initial
  // "space" state from moving the cursor before the buffer
  if (just_added_space && write > text) write--;

  // Return the new length of the string
  return (size_t)(write - text);
}
89
+
90
+ static VALUE text_clean(VALUE self, VALUE text, VALUE ending) {
91
+ char* ptext = RSTRING_PTR(text);
92
+ long len = RSTRING_LEN(text);
93
+ char line_sep = '.';
94
+
95
+ if (RSTRING_LEN(ending) == 1) {
96
+ line_sep = RSTRING_PTR(ending)[0];
97
+ }
98
+
99
+ rb_str_modify(text);
100
+
101
+ size_t new_length = text_clean_cstr(ptext, len, line_sep);
102
+
103
+ rb_str_set_len(text, (long)new_length);
104
+
105
+ return text;
106
+ }
107
+
108
+ extern "C"
109
+ void Init_text_clean() {
110
+ VALUE rb_mText = rb_define_module("TextClean");
111
+
112
+ u8_enc = rb_utf8_encoding();
113
+ bin_enc = rb_ascii8bit_encoding();
114
+
115
+ rb_define_module_function(rb_mText, "text_clean", RUBY_METHOD_FUNC(text_clean), 2);
116
+ }
@@ -0,0 +1,9 @@
1
+ require "text_clean/text_clean"
2
+ require "text_clean/version"
3
+
4
module TextClean
  # Cleans +text+ by delegating to the native +text_clean+ module function.
  #
  # The native function rewrites the string it is given in place, so an
  # unfrozen +text+ is modified and returned as the same object. A frozen
  # +text+ cannot be rewritten; in that case a copy is cleaned and returned
  # and the caller's string is left untouched.
  #
  # @param text [String] text to clean
  # @param line_sep [String] separator passed through to the native function
  # @return [String] the cleaned string
  def self.clean(text, line_sep = "\n")
    text = text.dup if text.frozen?
    # Call the C function
    text_clean(text, line_sep)
  end
end
@@ -0,0 +1,3 @@
1
module TextClean
  # Version string of the gem. Frozen so the constant cannot be mutated in place.
  VERSION = '0.1'.freeze
end
@@ -0,0 +1,27 @@
1
# coding: utf-8
# Gem specification for text_clean: a native extension plus a thin Ruby wrapper.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'text_clean/version'

Gem::Specification.new do |spec|
  spec.name          = "text_clean"
  spec.version       = TextClean::VERSION
  spec.authors       = ["Duane Johnson"]
  spec.email         = ["duane.johnson@gmail.com"]
  spec.description   = %q{Cleans text by removing punctuation, lowercasing. Very fast.}
  spec.summary       = %q{Text cleaner}
  spec.homepage      = "https://github.com/wordtreefoundation"
  spec.license       = "MIT"

  # Package every file tracked by git.
  spec.files         = `git ls-files`.split($/)
  # RubyGems generates a Ruby stub for each executable and `load`s the target,
  # so only Ruby scripts may be listed here. bin/text_clean.sh is still shipped
  # through spec.files; it is just not installed as a stubbed command.
  spec.executables   = spec.files.grep(%r{^bin/})
                           .reject { |f| f.end_with?(".sh") }
                           .map { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib", "ext"]

  # Built at install time through mkmf.
  spec.extensions    = %w[ext/text_clean/extconf.rb]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake", "~> 10.3"
  spec.add_development_dependency "rake-compiler", "~> 0.9"
  spec.add_development_dependency "byebug", "~> 3.4"
end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_clean
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Duane Johnson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.9'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.9'
55
+ - !ruby/object:Gem::Dependency
56
+ name: byebug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.4'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.4'
69
+ description: Cleans text by removing punctuation, lowercasing. Very fast.
70
+ email:
71
+ - duane.johnson@gmail.com
72
+ executables:
73
+ - text_clean
74
+ - text_clean.sh
75
+ extensions:
76
+ - ext/text_clean/extconf.rb
77
+ extra_rdoc_files: []
78
+ files:
79
+ - Gemfile
80
+ - Rakefile
81
+ - bin/text_clean
82
+ - bin/text_clean.sh
83
+ - ext/text_clean/extconf.rb
84
+ - ext/text_clean/text_clean.cc
85
+ - lib/text_clean.rb
86
+ - lib/text_clean/version.rb
87
+ - text_clean.gemspec
88
+ homepage: https://github.com/wordtreefoundation
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ - ext
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.2.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Text cleaner
113
+ test_files: []