text_clean 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +3 -0
- data/Rakefile +6 -0
- data/bin/text_clean +5 -0
- data/bin/text_clean.sh +24 -0
- data/ext/text_clean/extconf.rb +5 -0
- data/ext/text_clean/text_clean.cc +116 -0
- data/lib/text_clean.rb +9 -0
- data/lib/text_clean/version.rb +3 -0
- data/text_clean.gemspec +27 -0
- metadata +113 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 631e9068f2dffa8f14377a682868a03732144d00
|
4
|
+
data.tar.gz: 9aa6e9bb2d6ede27d7bca149da79e077f9a038e8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e55cad056b49d56c812df735ef6961dad3be08de91c63026cc1129e76cd90e96de0f9c12ca46928530c3846450a5498c214aabbed64991cea3e8a6a891ecc0cc
|
7
|
+
data.tar.gz: 139483afa8b4cb5204361e22430305196bca352278f23a80eafc1d711732f02b0fac96ffb589782135b77af8d63bb10c255d20ea0fc1ff78800b8084d7a657d5
|
data/Gemfile
ADDED
data/Rakefile
ADDED
data/bin/text_clean
ADDED
data/bin/text_clean.sh
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env bash
#
# text_clean.sh - stdin-to-stdout filter that normalises free text into one
# lowercase sentence per line, keeping only letters, apostrophes and spaces.
#
# bash is required: the sed programs below are written with ANSI-C quoting
# ($'...'), which a plain POSIX sh is not guaranteed to understand.

# Turn newlines and the sentence-ending punctuation marks (: ; ? !) into
# periods so every sentence boundary is represented by the same character.
tr '\n:;?!' '.....' |
# compact runs of periods into a single period
sed -E $'s/\.+/./g' |
# join lines ending in hyphenation (hyphen, optional spaces, line end)
sed -E $'s/- *\.//g' |
# replace periods (and the blanks around them) with newlines
sed -E $'s/[ \\\t]*\.[ \\\t]*/\\\n/g' |
# compact whitespace-like characters into a single space
sed -E $'s/[&\/, \\\t\-]+/ /g' |
# remove non-alpha characters (apostrophes and spaces are kept)
sed -E $'s/[^a-zA-Z\' ]//g' |
# strip leading whitespace; kept as its own sed process so that `^` applies
# to each line produced by the period-to-newline step above
sed -E $'s/^ *//' |
# convert to lowercase
tr '[:upper:]' '[:lower:]'
|
@@ -0,0 +1,116 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <string.h>
|
5
|
+
#include <assert.h>
|
6
|
+
|
7
|
+
static rb_encoding* u8_enc;
|
8
|
+
static rb_encoding* bin_enc;
|
9
|
+
|
10
|
+
/**
 * Cleans a text buffer in place. Text such as the following:
 *
 *   And behold, I said, "This is no good!"
 *   What shall ye say unto these people, there-
 *   fore?
 *
 * becomes a single line of text, like the following:
 *
 *   and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Spaces indicate word boundaries, while `line_sep` (a period in the example)
 * indicates sentence boundaries.
 *
 * Rules applied to each input byte:
 *   - 'A'..'Z' are lowercased; 'a'..'z' are copied.
 *   - tab , & / count as a space; runs of spaces collapse to one, and spaces
 *     at the start of the text, after a sentence boundary, before a sentence
 *     boundary, or at the end of the text are dropped.
 *   - . ? ! : ; count as a sentence boundary; consecutive boundaries collapse
 *     into a single `line_sep`.
 *   - "--" counts as a space (and is compacted like any other space).
 *   - A single '-' is dropped. If only blanks separate it from the next
 *     newline, everything through that newline is skipped so the two word
 *     fragments are joined.
 *   - Every other byte (digits, quotes, newlines, non-ASCII, ...) is dropped.
 *
 * @param text     buffer to clean; overwritten in place, NOT NUL-terminated
 *                 by this function
 * @param len      number of bytes in `text`; values <= 0 yield 0
 * @param line_sep byte written for each sentence boundary
 * @return         number of bytes of cleaned text now at the start of `text`
 *                 (never more than `len`)
 */
size_t text_clean_cstr(char* text, long len, char line_sep)
{
  if (text == NULL || len <= 0) return 0;

  const char* const eos = text + (size_t)len;
  char* write = text;
  // Start in the "just added a space" state so leading spaces are dropped.
  // Note that in this initial state no space has actually been written yet.
  bool just_added_space = true;
  bool just_added_period = false;

  // `write` never gets ahead of `read`: each input byte yields at most one
  // output byte, so reading and writing the same buffer is safe.
  for (const char* read = text; read < eos; read++) {
    char c = *read;

    if (c >= 'A' && c <= 'Z') {
      // Change upper case to lowercase
      c = (char)(c - 'A' + 'a');
    } else if (c == '\t' || c == ',' || c == '&' || c == '/') {
      // Inconsequential punctuation counts as whitespace
      c = ' ';
    } else if (c == '?' || c == '!' || c == ':' || c == ';') {
      // Sentence-ending punctuation counts as a period
      c = '.';
    }

    if (c == '-') {
      if (read + 1 < eos && read[1] == '-') {
        // Double dash: consume both and treat them as one word break, going
        // through the normal space rule below so it cannot create leading,
        // doubled or trailing spaces.
        read++;
        c = ' ';
      } else {
        // Single hyphen: look past blanks for a line end. Bounds are checked
        // so a hyphen at the very end of the buffer reads nothing past it.
        const char* scan_ahead = read + 1;
        while (scan_ahead < eos && (*scan_ahead == ' ' || *scan_ahead == '\t')) {
          scan_ahead++;
        }
        if (scan_ahead < eos && *scan_ahead == '\n') {
          // Hyphenated line join: resume after the newline.
          read = scan_ahead;
        }
        // The hyphen itself is never emitted.
        continue;
      }
    }

    if (c == '.') {
      if (!just_added_period) {
        // Erase the space before the period, but only if one was really
        // written (the initial state has the flag set with an empty output).
        if (just_added_space && write > text) write--;
        *write++ = line_sep;
        just_added_period = true;
        just_added_space = false;
      }
    } else if (c == ' ') {
      if (!just_added_space && !just_added_period) {
        *write++ = ' ';
        just_added_space = true;
        just_added_period = false;
      }
    } else if (c >= 'a' && c <= 'z') {
      *write++ = c;
      just_added_space = false;
      just_added_period = false;
    }
  }

  // Erase a trailing space, if one was actually written.
  if (just_added_space && write > text) write--;

  // Return the new length of the string
  return (size_t)(write - text);
}
|
89
|
+
|
90
|
+
/**
 * TextClean.text_clean(text, ending) -> text
 *
 * Cleans `text` in place with text_clean_cstr() and returns it.
 *
 * @param self   receiver (unused)
 * @param text   String to clean; it is modified destructively. Non-String
 *               objects go through StringValue (implicit to_str conversion,
 *               TypeError when that is not possible).
 * @param ending sentence separator. When it is a String of exactly one byte,
 *               that byte is used; otherwise (including nil) '.' is used.
 * @return       the cleaned string, truncated to its new length
 */
static VALUE text_clean(VALUE self, VALUE text, VALUE ending) {
  char line_sep = '.';
  (void)self;

  // Validate the arguments before touching any string internals:
  // RSTRING_PTR/RSTRING_LEN are only meaningful on String objects.
  StringValue(text);
  if (!NIL_P(ending)) {
    StringValue(ending);
    if (RSTRING_LEN(ending) == 1) {
      line_sep = RSTRING_PTR(ending)[0];
    }
  }

  // rb_str_modify() must run BEFORE the buffer pointer is read: it rejects
  // frozen strings and may give the string its own private buffer, which
  // would leave a pointer fetched earlier dangling or aimed at shared data.
  rb_str_modify(text);

  size_t new_length = text_clean_cstr(RSTRING_PTR(text), RSTRING_LEN(text), line_sep);

  rb_str_set_len(text, (long)new_length);

  return text;
}
|
107
|
+
|
108
|
+
extern "C"
|
109
|
+
void Init_text_clean() {
|
110
|
+
VALUE rb_mText = rb_define_module("TextClean");
|
111
|
+
|
112
|
+
u8_enc = rb_utf8_encoding();
|
113
|
+
bin_enc = rb_ascii8bit_encoding();
|
114
|
+
|
115
|
+
rb_define_module_function(rb_mText, "text_clean", RUBY_METHOD_FUNC(text_clean), 2);
|
116
|
+
}
|
data/lib/text_clean.rb
ADDED
data/text_clean.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for text_clean. The gem's lib directory is put on the
# load path first so the version constant can be required from the source tree.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) if !$LOAD_PATH.include?(lib_dir)
require 'text_clean/version'

Gem::Specification.new do |gem|
  # Identity and descriptive fields
  gem.name          = "text_clean"
  gem.version       = TextClean::VERSION
  gem.authors       = ["Duane Johnson"]
  gem.email         = ["duane.johnson@gmail.com"]
  gem.description   = "Cleans text by removing punctuation, lowercasing. Very fast."
  gem.summary       = "Text cleaner"
  gem.homepage      = "https://github.com/wordtreefoundation"
  gem.license       = "MIT"

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = %w[lib ext]

  # Native extension build script
  gem.extensions    = ["ext/text_clean/extconf.rb"]

  # Development-only dependencies, declared in the original order
  [
    ["bundler",       "~> 1.3"],
    ["rake",          "~> 10.3"],
    ["rake-compiler", "~> 0.9"],
    ["byebug",        "~> 3.4"],
  ].each do |dep_name, requirement|
    gem.add_development_dependency dep_name, requirement
  end
end
|
metadata
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_clean
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Duane Johnson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-03-24 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.3'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0.9'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0.9'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: byebug
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.4'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.4'
|
69
|
+
description: Cleans text by removing punctuation, lowercasing. Very fast.
|
70
|
+
email:
|
71
|
+
- duane.johnson@gmail.com
|
72
|
+
executables:
|
73
|
+
- text_clean
|
74
|
+
- text_clean.sh
|
75
|
+
extensions:
|
76
|
+
- ext/text_clean/extconf.rb
|
77
|
+
extra_rdoc_files: []
|
78
|
+
files:
|
79
|
+
- Gemfile
|
80
|
+
- Rakefile
|
81
|
+
- bin/text_clean
|
82
|
+
- bin/text_clean.sh
|
83
|
+
- ext/text_clean/extconf.rb
|
84
|
+
- ext/text_clean/text_clean.cc
|
85
|
+
- lib/text_clean.rb
|
86
|
+
- lib/text_clean/version.rb
|
87
|
+
- text_clean.gemspec
|
88
|
+
homepage: https://github.com/wordtreefoundation
|
89
|
+
licenses:
|
90
|
+
- MIT
|
91
|
+
metadata: {}
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
- ext
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
requirements: []
|
108
|
+
rubyforge_project:
|
109
|
+
rubygems_version: 2.2.2
|
110
|
+
signing_key:
|
111
|
+
specification_version: 4
|
112
|
+
summary: Text cleaner
|
113
|
+
test_files: []
|