text_clean 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 631e9068f2dffa8f14377a682868a03732144d00
4
+ data.tar.gz: 9aa6e9bb2d6ede27d7bca149da79e077f9a038e8
5
+ SHA512:
6
+ metadata.gz: e55cad056b49d56c812df735ef6961dad3be08de91c63026cc1129e76cd90e96de0f9c12ca46928530c3846450a5498c214aabbed64991cea3e8a6a891ecc0cc
7
+ data.tar.gz: 139483afa8b4cb5204361e22430305196bca352278f23a80eafc1d711732f02b0fac96ffb589782135b77af8d63bb10c255d20ea0fc1ff78800b8084d7a657d5
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
@@ -0,0 +1,6 @@
1
+ require "rake/extensiontask"
2
+
3
+ Rake::ExtensionTask.new "text_clean" do |ext|
4
+ ext.lib_dir = "lib/text_clean"
5
+ ext.source_pattern = "*.{c,cc}"
6
+ end
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "text_clean"
4
+
5
+ puts TextClean.clean(STDIN.read, "\n")
@@ -0,0 +1,24 @@
1
+ # Text-cleaning pipeline (stdin to stdout). First, replace newlines with periods so the input becomes one long line, which makes the regexes below easier
2
+ tr '\n' '.' |
3
+ # convert the other sentence-ending punctuation (: ; ? !) to periods so every sentence boundary looks the same
4
+ tr ':' '.' |
5
+
6
+ tr ';' '.' |
7
+
8
+ tr '?' '.' |
9
+
10
+ tr '!' '.' |
11
+ # collapse runs of consecutive periods into a single period
12
+ sed -E $'s/\.+/./g' |
13
+ # join words hyphenated across a boundary: delete a hyphen (plus any spaces after it) that is followed by a period, which may be a former newline
14
+ sed -E $'s/- *\.//g' |
15
+ # turn each period, together with any surrounding spaces, tabs or backslashes, into a newline (one sentence per line)
16
+ sed -E $'s/[ \\\t]*\.[ \\\t]*/\\\n/g' |
17
+ # collapse runs of separator characters (ampersand, slash, comma, space, tab, backslash, hyphen) into a single space
18
+ sed -E $'s/[&\/, \\\t\-]+/ /g' |
19
+ # remove every character other than letters (a-z, A-Z), apostrophes and spaces
20
+ sed -E $'s/[^a-zA-Z\' ]//g' |
21
+ # strip leading spaces from each line
22
+ sed -E $'s/^ *//' |
23
+ # convert upper case letters to lower case
24
+ tr '[:upper:]' '[:lower:]'
@@ -0,0 +1,5 @@
1
+ require "mkmf"
2
+
3
+ dir_config("text_clean")
4
+
5
+ create_makefile("text_clean")
@@ -0,0 +1,116 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include <stdio.h>
4
+ #include <string.h>
5
+ #include <assert.h>
6
+
7
+ static rb_encoding* u8_enc;
8
+ static rb_encoding* bin_enc;
9
+
10
/**
 * Cleans `text` in place and returns the length of the cleaned result.
 *
 * Transforms text such as the following:
 *
 *   And behold, I said, "This is no good!"
 *   What shall ye say unto these people, there-
 *   fore?
 *
 * Into a cleaned up single line of text, like the following:
 *
 *   and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Spaces indicate word boundaries, while `line_sep` ('.' in the example above)
 * indicates sentence boundaries.
 *
 * Rules applied to each input byte:
 *   - 'A'..'Z' is lowercased (ASCII only); 'a'..'z' is copied.
 *   - tab, ',', '&' and '/' count as a space; runs of spaces collapse to one,
 *     and no space is emitted at the start, after a separator, or at the end.
 *   - '?', '!', ':' and ';' count as a period; a run of periods emits a single
 *     `line_sep`, and a space directly before it is erased.
 *   - "--" counts as a space.
 *   - a single '-' is dropped; if only spaces/tabs separate it from a newline,
 *     everything up to and including that newline is dropped as well, joining
 *     the hyphenated word fragments.
 *   - every other byte (digits, quotes, newlines, non-ASCII bytes, ...) is dropped.
 *
 * NOTE(review): a newline that does not follow a hyphen is simply dropped, so
 * "foo\nbar" becomes "foobar". Confirm whether a word or sentence boundary was
 * intended there before changing it; existing callers see the current output.
 *
 * @param text      buffer holding `len` bytes; rewritten in place. No NUL
 *                  terminator is required on input or written on output.
 * @param len       number of bytes in `text`; values <= 0 yield 0.
 * @param line_sep  byte written for each sentence boundary.
 * @return          number of cleaned bytes now at the start of `text`
 *                  (never more than `len`).
 */
size_t text_clean_cstr(char* text, long len, char line_sep)
{
  if (len <= 0) {
    return 0;
  }

  const char* const eos = text + (size_t)len;
  char* write = text;
  // Invariant: when just_added_space is true, either nothing has been written
  // yet (write == text) or the last byte written is a space.
  bool just_added_space = true;   // starts true to suppress leading spaces
  bool just_added_period = false;

  for (const char* read = text; read < eos; read++) {
    char c = *read;

    if (c >= 'A' && c <= 'Z') {
      // Change upper case to lower case
      c = (char)(c + ('a' - 'A'));
    } else if (c == '\t' || c == ',' || c == '&' || c == '/') {
      // Inconsequential punctuation counts as whitespace
      c = ' ';
    } else if (c == '?' || c == '!' || c == ':' || c == ';') {
      // Sentence-ending punctuation counts as a period
      c = '.';
    }

    if (c == '-') {
      if (read + 1 < eos && read[1] == '-') {
        // Double dash: consume both and handle it as an ordinary space below,
        // so the usual collapsing rules apply to it too.
        read++;
        c = ' ';
      } else {
        // Single hyphen: look past trailing blanks for a line break. The bound
        // check keeps the scan inside the buffer when '-' is the last byte.
        const char* scan_ahead = read + 1;
        while (scan_ahead < eos && (*scan_ahead == ' ' || *scan_ahead == '\t')) {
          scan_ahead++;
        }
        if (scan_ahead < eos && *scan_ahead == '\n') {
          // Hyphenated line join: skip through the newline
          read = scan_ahead;
        }
        // The hyphen itself is never copied
        continue;
      }
    }

    if (c == '.') {
      if (!just_added_period) {
        // Erase a space before the separator, but only if one was really
        // written; at the start of the output there is nothing to erase.
        if (just_added_space && write > text) {
          write--;
        }
        *write++ = line_sep;
        just_added_period = true;
        just_added_space = false;
      }
    } else if (c == ' ') {
      if (!just_added_space && !just_added_period) {
        *write++ = ' ';
        just_added_space = true;
        just_added_period = false;
      }
    } else if (c >= 'a' && c <= 'z') {
      *write++ = c;
      just_added_space = false;
      just_added_period = false;
    }
    // Anything else is dropped
  }

  // Erase a trailing space, if one was actually written
  if (just_added_space && write > text) {
    write--;
  }

  // Return the new length of the string
  return (size_t)(write - text);
}
89
+
90
+ static VALUE text_clean(VALUE self, VALUE text, VALUE ending) {
91
+ char* ptext = RSTRING_PTR(text);
92
+ long len = RSTRING_LEN(text);
93
+ char line_sep = '.';
94
+
95
+ if (RSTRING_LEN(ending) == 1) {
96
+ line_sep = RSTRING_PTR(ending)[0];
97
+ }
98
+
99
+ rb_str_modify(text);
100
+
101
+ size_t new_length = text_clean_cstr(ptext, len, line_sep);
102
+
103
+ rb_str_set_len(text, (long)new_length);
104
+
105
+ return text;
106
+ }
107
+
108
+ extern "C"
109
+ void Init_text_clean() {
110
+ VALUE rb_mText = rb_define_module("TextClean");
111
+
112
+ u8_enc = rb_utf8_encoding();
113
+ bin_enc = rb_ascii8bit_encoding();
114
+
115
+ rb_define_module_function(rb_mText, "text_clean", RUBY_METHOD_FUNC(text_clean), 2);
116
+ }
@@ -0,0 +1,9 @@
1
+ require "text_clean/text_clean"
2
+ require "text_clean/version"
3
+
4
module TextClean
  class << self
    # Cleans +text+ by handing it to the native +text_clean+ function
    # defined by the compiled extension.
    #
    # text     - the String to clean (the native code rewrites it in place).
    # line_sep - the String used to mark sentence boundaries; defaults to a
    #            newline.
    #
    # Returns whatever the native function returns (the cleaned String).
    def clean(text, line_sep = "\n")
      text_clean(text, line_sep)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module TextClean
2
+ VERSION = '0.1'
3
+ end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'text_clean/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "text_clean"
8
+ spec.version = TextClean::VERSION
9
+ spec.authors = ["Duane Johnson"]
10
+ spec.email = ["duane.johnson@gmail.com"]
11
+ spec.description = %q{Cleans text by removing punctuation, lowercasing. Very fast.}
12
+ spec.summary = %q{Text cleaner}
13
+ spec.homepage = "https://github.com/wordtreefoundation"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib", "ext"]
20
+
21
+ spec.extensions = %w[ext/text_clean/extconf.rb]
22
+
23
+ spec.add_development_dependency "bundler", "~> 1.3"
24
+ spec.add_development_dependency "rake", "~> 10.3"
25
+ spec.add_development_dependency "rake-compiler", "~> 0.9"
26
+ spec.add_development_dependency "byebug", "~> 3.4"
27
+ end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_clean
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Duane Johnson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.9'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.9'
55
+ - !ruby/object:Gem::Dependency
56
+ name: byebug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.4'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.4'
69
+ description: Cleans text by removing punctuation, lowercasing. Very fast.
70
+ email:
71
+ - duane.johnson@gmail.com
72
+ executables:
73
+ - text_clean
74
+ - text_clean.sh
75
+ extensions:
76
+ - ext/text_clean/extconf.rb
77
+ extra_rdoc_files: []
78
+ files:
79
+ - Gemfile
80
+ - Rakefile
81
+ - bin/text_clean
82
+ - bin/text_clean.sh
83
+ - ext/text_clean/extconf.rb
84
+ - ext/text_clean/text_clean.cc
85
+ - lib/text_clean.rb
86
+ - lib/text_clean/version.rb
87
+ - text_clean.gemspec
88
+ homepage: https://github.com/wordtreefoundation
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ - ext
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.2.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Text cleaner
113
+ test_files: []