text_comb 0.0.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +132 -0
- data/Rakefile +4 -0
- data/lib/text_comb.rb +72 -0
- data/lib/text_comb/iterator.rb +18 -0
- data/lib/text_comb/string.rb +9 -0
- data/lib/text_comb/string_extensions.rb +23 -0
- data/lib/text_comb/version.rb +3 -0
- data/test/test_java_interface.rb +48 -0
- data/test/test_textcomb_string.rb +51 -0
- data/text_comb.gemspec +30 -0
- data/vendor/cue.language.jar +0 -0
- metadata +55 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e4df656cd44bb7a284b5e98201fb6085e5e91cc0
|
4
|
+
data.tar.gz: 858677d69dfaa6d2c4f2f386e44a41dc7f1f0cbd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 83d309b17b501242535177dc601317daca51dc972d65c07d721d5c341b2fe9e60852286a4370d337a82ba4dabbb4a5b8647e4acebdddaeda813cdb71d74e947a
|
7
|
+
data.tar.gz: a13e54b3c74d2ba01f3c1b806b4b7ca1ef5e8915fa3cb00ab36820d8a37d1011eb46ba762956297a39c974e09394c4b36b0fed86deb63c364db909583e17b1b7
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Dan Bernier
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
# Integration Exercise: Java Library Wrapper
|
2
|
+
|
3
|
+
## Exercise Summary
|
4
|
+
|
5
|
+
- You should create a gem using JRuby that wraps an existing Java library.
|
6
|
+
- Your gem should work with a Java library that doesn't already have
|
7
|
+
a good wrapper.
|
8
|
+
- You should make the API for your library look and feel like Ruby, not Java.
|
9
|
+
|
10
|
+
## Wrapping cue.language
|
11
|
+
|
12
|
+
For this assignment, I'm wrapping the
|
13
|
+
[cue.language](https://github.com/vcl/cue.language) library, which
|
14
|
+
handles "counting strings, identifying languages, and removing stop
|
15
|
+
words."
|
16
|
+
|
17
|
+
## How to Use It
|
18
|
+
|
19
|
+
### It's All In the TextComb Module
|
20
|
+
|
21
|
+
There are methods for splitting up text:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
require 'text_comb'
|
25
|
+
|
26
|
+
string = "He must be a little nuts. He is. I mean he just isn't well
|
27
|
+
screwed on is he?"
|
28
|
+
|
29
|
+
p TextComb.words(string).to_a
|
30
|
+
|
31
|
+
# prints:
|
32
|
+
["He", "must", "be", "a", "little", "nuts", "He", "is", "I", "mean"...
|
33
|
+
|
34
|
+
TextComb.sentences(string).each do |sentence|
|
35
|
+
p sentence
|
36
|
+
end
|
37
|
+
|
38
|
+
# prints:
|
39
|
+
"He must be a little nuts. "
|
40
|
+
"He is. "
|
41
|
+
"I mean he just isn't well screwed on is he?"
|
42
|
+
|
43
|
+
TextComb.ngrams(string, 5).each do |ngram|
|
44
|
+
p ngram
|
45
|
+
end
|
46
|
+
|
47
|
+
# prints:
|
48
|
+
"He must be a little"
|
49
|
+
"must be a little nuts"
|
50
|
+
"I mean he just isn't"
|
51
|
+
...
|
52
|
+
```
|
53
|
+
|
54
|
+
TextComb can take a guess at the language, and use the appropriate stop-words:
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
string = "That's a great mustache, grandma!"
|
58
|
+
TextComb.ngrams(string, 2, :stop_words=>:guess).to_a
|
59
|
+
|
60
|
+
# returns:
|
61
|
+
["great mustache", "mustache grandma"]
|
62
|
+
```
|
63
|
+
|
64
|
+
If it picks wrong, and you know what you're dealing with, you can specify:
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
TextComb.ngrams(string, 3, :stop_words=>:Croatian).each do |ngram|
|
68
|
+
puts ngram
|
69
|
+
end
|
70
|
+
```
|
71
|
+
|
72
|
+
If you're curious, TextComb will tell you what it guessed:
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
string = "J'ai la moutarde dans ma moustache."
|
76
|
+
TextComb.guess_language(string).to_s # "French"
|
77
|
+
```
|
78
|
+
|
79
|
+
### Mix-in TextComb::StringExtensions
|
80
|
+
|
81
|
+
Extend a string with TextComb::StringExtensions, and it'll have those
|
82
|
+
methods:
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
require 'text_comb'
|
86
|
+
|
87
|
+
motto = "I came. I saw. I hacked."
|
88
|
+
motto.extend(TextComb::StringExtensions)
|
89
|
+
|
90
|
+
motto.sentences.to_a # ["I came. ", "I saw. ", "I hacked."]
|
91
|
+
motto.words.to_a.uniq # ["I", "came", "saw", "hacked"]
|
92
|
+
```
|
93
|
+
|
94
|
+
Stop-words for n-grams work the same way, too.
|
95
|
+
|
96
|
+
```ruby
|
97
|
+
string = "I saw red roosters at Ted's farm."
|
98
|
+
string.extend(TextComb::StringExtensions)
|
99
|
+
string.ngrams(2, :stop_words => :English).to_a
|
100
|
+
|
101
|
+
# returns:
|
102
|
+
["saw red", "red roosters", "Ted's farm"]
|
103
|
+
```
|
104
|
+
|
105
|
+
### Make a TextComb::String
|
106
|
+
|
107
|
+
TextComb::String includes TextComb::StringExtensions, but delegates everything
|
108
|
+
else to its string. It's like mixing TextComb::StringExtensions into your
|
109
|
+
own string.
|
110
|
+
|
111
|
+
```ruby
|
112
|
+
require 'text_comb'
|
113
|
+
|
114
|
+
littany = TextComb::String.new("I must not fear.")
|
115
|
+
littany.ngrams(3).to_a # ["I must not", "must not fear"]
|
116
|
+
```
|
117
|
+
|
118
|
+
Even handier, there's the TextComb.string method to save you some
|
119
|
+
finger-tapping.
|
120
|
+
|
121
|
+
```ruby
|
122
|
+
require 'text_comb'
|
123
|
+
littany = TextComb.string("I must not fear.")
|
124
|
+
littany.words.to_a # -> ["I", "must", "not", "fear"]
|
125
|
+
|
126
|
+
littany.guess_language.to_s # -> "English"
|
127
|
+
```
|
128
|
+
|
129
|
+
### Future Plans
|
130
|
+
|
131
|
+
- `TextComb.ngrams` currently yields whole strings - maybe split
|
132
|
+
them into Arrays of words.
|
data/Rakefile
ADDED
data/lib/text_comb.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'java'
|
2
|
+
require_relative '../vendor/cue.language.jar'
|
3
|
+
require_relative 'text_comb/string_extensions'
|
4
|
+
require_relative 'text_comb/string'
|
5
|
+
require_relative 'text_comb/iterator'
|
6
|
+
|
7
|
+
# Ruby-flavoured entry points for the cue.language Java library: iterate over
# the words, sentences and n-grams of a string, and look up or guess the
# stop-word list for a language.
module TextComb

  # Iterate over the words in +string+.
  #
  #   TextComb.words("I must not fear. ").to_a  # => ["I", "must", "not", "fear"]
  #
  # Returns the Java WordIterator wrapped in an Iterator (see text_comb/iterator).
  def self.words(string)
    enumerate(cue.WordIterator.new(string))
  end

  # Iterate over the sentences in +string+.
  #
  # Returns the Java SentenceIterator wrapped in an Iterator.
  def self.sentences(string)
    enumerate(cue.SentenceIterator.new(string))
  end

  # Iterate over the +n+-word n-grams in +string+.
  #
  #   TextComb.ngrams(string, 3)
  #   TextComb.ngrams(string, 3, :locale => java.util.Locale.default)
  #   TextComb.ngrams(string, 3, :stop_words => :guess)
  #   TextComb.ngrams(string, 3, :stop_words => :English)
  #   TextComb.ngrams(string, 3, :stop_words => TextComb.guess_language(string))
  #
  # Options:
  #   :locale     - handed to the Java NGramIterator; defaults to
  #                 java.util.Locale.default.
  #   :stop_words - :guess (guess from +string+), any other Symbol (looked up
  #                 with TextComb.stop_words), a StopWords object, or nil /
  #                 absent.
  #
  # Raises RuntimeError when :stop_words is anything else.
  def self.ngrams(string, n, options={})

    locale = options[:locale] || java.util.Locale.default

    # :guess must be tested before the generic Symbol branch, since :guess is
    # itself a Symbol.
    stop_words_val = case options[:stop_words]
                     when :guess
                       guess_language(string)
                     when Symbol
                       stop_words(options[:stop_words])
                     when stop.StopWords
                       options[:stop_words]
                     when nil
                       nil
                     else
                       raise "Can't recognize the stop_words: #{options[:stop_words]}"
                     end

    enumerate(cue.NGramIterator.new(n, string, locale, stop_words_val))
  end

  # Ask cue.language which language +string+ appears to be written in.
  #
  #   TextComb.guess_language "How are you?"
  #
  # Returns whatever StopWords.guess returns for +string+.
  def self.guess_language(string)
    stop.StopWords.guess(string)
  end

  # Look up a stop-word list by name.
  #
  #   TextComb.stop_words :English
  #   TextComb.stop_words :French
  #
  # The symbol is resolved with const_get on the StopWords class, so it must
  # match one of its constant names exactly.
  def self.stop_words(stopwords_symbol)
    stop.StopWords.const_get(stopwords_symbol)
  end


  # For convenience: wrap +s+ in a TextComb::String.
  def self.string(s)
    TextComb::String.new(s)
  end


  # The cue.lang Java package.
  def self.cue
    Java::CueLang
  end

  # The cue.lang.stop Java package.
  def self.stop
    Java::CueLangStop
  end

  # Wrap a Java iterator in an Iterator.
  def self.enumerate(iterator)
    Iterator.new(iterator)
  end

  # A bare `private` has no effect on `def self.` methods, so the internal
  # helpers are hidden explicitly.
  private_class_method :cue, :stop, :enumerate
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module TextComb

  # Mix-in that offers the TextComb module functions as instance methods.
  # Each method converts the receiver with #to_s and forwards it to the
  # TextComb function of the same name.
  module StringExtensions

    # The receiver's words; see TextComb.words.
    def words
      TextComb.words(to_s)
    end

    # The receiver's sentences; see TextComb.sentences.
    def sentences
      TextComb.sentences(to_s)
    end

    # The receiver's +n+-word n-grams; +options+ is passed through untouched
    # to TextComb.ngrams.
    def ngrams(n, options={})
      TextComb.ngrams(to_s, n, options)
    end

    # The language guessed for the receiver; see TextComb.guess_language.
    def guess_language
      TextComb.guess_language(to_s)
    end

  end

end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'text_comb'
|
3
|
+
|
4
|
+
# Exercises the module-level TextComb functions (words, sentences, ngrams).
#
# The base class is chosen at load time: Minitest 5 provides Minitest::Test,
# while older releases only provide MiniTest::Unit::TestCase.
class TestJavaInterface < (defined?(Minitest::Test) ? Minitest::Test : MiniTest::Unit::TestCase)

  LITTANY = "
I must not fear. Fear is the mind-killer. Fear is the little-death
that brings total obliteration. I will face my fear. I will permit
it to pass over me and through me. And when it has gone past I will
turn the inner eye to see its path. Where the fear has gone there
will be nothing. Only I will remain.".strip

  # Words come back without the trailing punctuation.
  def test_each_word
    expected = %w[I must not fear]

    assert_equal expected, TextComb.words("I must not fear. ").to_a
  end

  # Each sentence keeps its trailing space, except the last one.
  def test_each_sentence

    expected = [
      "I must not fear. ",
      "Fear is the mind-killer. ",
      "Fear is the little-death that brings total obliteration. ",
      "I will face my fear. ",
      "I will permit it to pass over me and through me. ",
      "And when it has gone past I will turn the inner eye to see its path. ",
      "Where the fear has gone there will be nothing. ",
      "Only I will remain."
    ]

    assert_equal expected, TextComb.sentences(LITTANY).to_a
  end

  def test_each_ngram
    expected = ["I must", "must not", "not fear"]

    assert_equal expected, TextComb.ngrams("I must not fear. ", 2).to_a
  end

  # With English stop words only one 3-gram is expected to survive.
  def test_each_ngram_with_stop_words
    text = "Fear is the little-death that brings total obliteration."
    expected = ["brings total obliteration"]
    ngrams = TextComb.ngrams(text, 3, :stop_words => :English).to_a

    assert_equal expected, ngrams
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'text_comb'
|
3
|
+
|
4
|
+
# Exercises TextComb::String as built by TextComb.string: ordinary String
# methods, plus words, sentences, ngrams and guess_language.
#
# The base class is chosen at load time: Minitest 5 provides Minitest::Test,
# while older releases only provide MiniTest::Unit::TestCase.
class TestTextCombString < (defined?(Minitest::Test) ? Minitest::Test : MiniTest::Unit::TestCase)

  # Plain String methods must give the same answers on the wrapper.
  def test_can_call_normal_string_methods
    plain_string = "I came. I saw. I hacked."
    textcomb = TextComb.string(plain_string)

    assert_equal plain_string.upcase, textcomb.upcase
    assert_equal plain_string.reverse, textcomb.reverse
    assert_equal plain_string.gsub(/i/i, "We"), textcomb.gsub(/i/i, "We")
  end

  def test_can_call_each_word
    textcomb = TextComb.string("I came. I saw. I hacked.")
    expected = %w[I came I saw I hacked]

    assert_equal expected, textcomb.words.to_a
  end

  def test_can_call_each_sentence
    textcomb = TextComb.string("I came. I saw. I hacked.")
    expected = ["I came. ", "I saw. ", "I hacked."]

    assert_equal expected, textcomb.sentences.to_a
  end

  def test_can_call_each_ngram
    textcomb = TextComb.string("Never wake a sleeping cat.")
    expected = ["Never wake a", "wake a sleeping", "a sleeping cat"]

    assert_equal expected, textcomb.ngrams(3).to_a
  end

  def test_ngrams_with_stop_words
    textcomb = TextComb.string("I saw red roosters at Willy's farm.")
    expected = ["saw red", "red roosters", "Willy's farm"]

    ngrams = textcomb.ngrams(2, :stop_words => :English).to_a
    assert_equal expected, ngrams
  end

  # guess_language is compared against the object TextComb.stop_words returns.
  def test_can_guess_its_language
    textcomb = TextComb.string("I ate all the peanuts, then threw them up.")
    assert_equal TextComb.stop_words(:English), textcomb.guess_language

    textcomb = TextComb.string("J'ai la moutarde dans ma moustache.")
    assert_equal TextComb.stop_words(:French), textcomb.guess_language
  end
end
|
data/text_comb.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative "lib/text_comb/version"
|
2
|
+
|
3
|
+
# Gem specification for text_comb. The gem is built for the java platform and
# ships the cue.language jar from vendor/.
spec = Gem::Specification.new do |gem|
  gem.name    = 'text_comb'
  gem.version = TextComb::VERSION

  gem.summary     = 'Extract words, sentences, and n-grams from natural-language text.'
  gem.description = 'A Ruby wrapper for the cue.language java library.'

  gem.licenses = ['MIT']

  # Library and test sources are globbed; the remaining files are listed by hand.
  project_files = %w[LICENSE README.md Rakefile text_comb.gemspec vendor/cue.language.jar]
  gem.files = Dir['lib/**/*.rb'] + Dir['test/**/*.rb'] + project_files

  gem.require_path          = 'lib'
  gem.platform              = 'java'
  gem.required_ruby_version = '>= 1.9.2'

  gem.author   = 'Dan Bernier'
  gem.email    = 'danbernier@gmail.com'
  gem.homepage = 'https://github.com/danbernier/text_comb'
end
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_comb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: java
|
6
|
+
authors:
|
7
|
+
- Dan Bernier
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-09-14 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A Ruby wrapper for the cue.language java library.
|
14
|
+
email: danbernier@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- LICENSE
|
20
|
+
- README.md
|
21
|
+
- Rakefile
|
22
|
+
- lib/text_comb.rb
|
23
|
+
- lib/text_comb/iterator.rb
|
24
|
+
- lib/text_comb/string.rb
|
25
|
+
- lib/text_comb/string_extensions.rb
|
26
|
+
- lib/text_comb/version.rb
|
27
|
+
- test/test_java_interface.rb
|
28
|
+
- test/test_textcomb_string.rb
|
29
|
+
- text_comb.gemspec
|
30
|
+
- vendor/cue.language.jar
|
31
|
+
homepage: https://github.com/danbernier/text_comb
|
32
|
+
licenses:
|
33
|
+
- MIT
|
34
|
+
metadata: {}
|
35
|
+
post_install_message:
|
36
|
+
rdoc_options: []
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.9.2
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
requirements: []
|
50
|
+
rubyforge_project:
|
51
|
+
rubygems_version: 2.4.5
|
52
|
+
signing_key:
|
53
|
+
specification_version: 4
|
54
|
+
summary: Extract words, sentences, and n-grams from natural-language text.
|
55
|
+
test_files: []
|