text_comb 0.0.2-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +132 -0
- data/Rakefile +4 -0
- data/lib/text_comb.rb +72 -0
- data/lib/text_comb/iterator.rb +18 -0
- data/lib/text_comb/string.rb +9 -0
- data/lib/text_comb/string_extensions.rb +23 -0
- data/lib/text_comb/version.rb +3 -0
- data/test/test_java_interface.rb +48 -0
- data/test/test_textcomb_string.rb +51 -0
- data/text_comb.gemspec +30 -0
- data/vendor/cue.language.jar +0 -0
- metadata +55 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e4df656cd44bb7a284b5e98201fb6085e5e91cc0
|
4
|
+
data.tar.gz: 858677d69dfaa6d2c4f2f386e44a41dc7f1f0cbd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 83d309b17b501242535177dc601317daca51dc972d65c07d721d5c341b2fe9e60852286a4370d337a82ba4dabbb4a5b8647e4acebdddaeda813cdb71d74e947a
|
7
|
+
data.tar.gz: a13e54b3c74d2ba01f3c1b806b4b7ca1ef5e8915fa3cb00ab36820d8a37d1011eb46ba762956297a39c974e09394c4b36b0fed86deb63c364db909583e17b1b7
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Dan Bernier
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
# Integration Exercise: Java Library Wrapper
|
2
|
+
|
3
|
+
## Exercise Summary
|
4
|
+
|
5
|
+
- You should create a gem using JRuby that wraps an existing Java library.
|
6
|
+
- Your gem should work with a Java library that doesn't already have
|
7
|
+
a good wrapper.
|
8
|
+
- You should make the API for your library look and feel like Ruby, not Java.
|
9
|
+
|
10
|
+
## Wrapping cue.language
|
11
|
+
|
12
|
+
For this assignment, I'm wrapping the
|
13
|
+
[cue.language](https://github.com/vcl/cue.language) library, which
|
14
|
+
handles "counting strings, identifying languages, and removing stop
|
15
|
+
words."
|
16
|
+
|
17
|
+
## How to Use It
|
18
|
+
|
19
|
+
### It's All In the TextComb Module
|
20
|
+
|
21
|
+
There are methods for splitting up text:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
require 'text_comb'
|
25
|
+
|
26
|
+
string = "He must be a little nuts. He is. I mean he just isn't well
|
27
|
+
screwed on is he?"
|
28
|
+
|
29
|
+
p TextComb.words(string).to_a
|
30
|
+
|
31
|
+
# prints:
|
32
|
+
["He", "must", "be", "a", "little", "nuts", "He", "is", "I", "mean"...
|
33
|
+
|
34
|
+
TextComb.sentences(string).each do |sentence|
|
35
|
+
p sentence
|
36
|
+
end
|
37
|
+
|
38
|
+
# prints:
|
39
|
+
"He must be a little nuts. "
|
40
|
+
"He is. "
|
41
|
+
"I mean he just isn't well screwed on is he?"
|
42
|
+
|
43
|
+
TextComb.ngrams(string, 5).each do |ngram|
|
44
|
+
p ngram
|
45
|
+
end
|
46
|
+
|
47
|
+
# prints:
|
48
|
+
"He must be a little"
|
49
|
+
"must be a little nuts"
|
50
|
+
"I mean he just isn't"
|
51
|
+
...
|
52
|
+
```
|
53
|
+
|
54
|
+
TextComb can take a guess at the language, and use the appropriate stop-words:
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
string = "That's a great mustache, grandma!"
|
58
|
+
TextComb.ngrams(string, 2, :stop_words=>:guess).to_a
|
59
|
+
|
60
|
+
# returns:
|
61
|
+
["great mustache", "mustache grandma"]
|
62
|
+
```
|
63
|
+
|
64
|
+
If it picks wrong, and you know what you're dealing with, you can specify:
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
TextComb.ngrams(string, 3, :stop_words=>:Croatian).each do |ngram|
|
68
|
+
puts ngram
|
69
|
+
end
|
70
|
+
```
|
71
|
+
|
72
|
+
If you're curious, TextComb will tell you which language it guessed:
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
string = "J'ai la moutarde dans ma moustache."
|
76
|
+
TextComb.guess_language(string).to_s # "French"
|
77
|
+
```
|
78
|
+
|
79
|
+
### Mix-in TextComb::StringExtensions
|
80
|
+
|
81
|
+
Extend a string with TextComb::StringExtensions, and it'll have those
|
82
|
+
methods:
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
require 'text_comb'
|
86
|
+
|
87
|
+
motto = "I came. I saw. I hacked."
|
88
|
+
motto.extend(TextComb::StringExtensions)
|
89
|
+
|
90
|
+
motto.sentences.to_a # ["I came. ", "I saw. ", "I hacked."]
|
91
|
+
motto.words.to_a.uniq # ["I", "came", "saw", "hacked"]
|
92
|
+
```
|
93
|
+
|
94
|
+
Stop-words for n-grams work the same way, too.
|
95
|
+
|
96
|
+
```ruby
|
97
|
+
string = "I saw red roosters at Ted's farm."
|
98
|
+
string.extend(TextComb::StringExtensions)
|
99
|
+
string.ngrams(2, :stop_words => :English).to_a
|
100
|
+
|
101
|
+
# returns:
|
102
|
+
["saw red", "red roosters", "Ted's farm"]
|
103
|
+
```
|
104
|
+
|
105
|
+
### Make a TextComb::String
|
106
|
+
|
107
|
+
TextComb::String includes TextComb::StringExtensions, but delegates everything
|
108
|
+
else to its string. It's like mixing TextComb::StringExtensions into your
|
109
|
+
own string.
|
110
|
+
|
111
|
+
```ruby
|
112
|
+
require 'text_comb'
|
113
|
+
|
114
|
+
littany = TextComb::String.new("I must not fear.")
|
115
|
+
littany.ngrams(3).to_a # ["I must not", "must not fear"]
|
116
|
+
```
|
117
|
+
|
118
|
+
Even handier, there's the TextComb.string method to save you some
|
119
|
+
finger-tapping.
|
120
|
+
|
121
|
+
```ruby
|
122
|
+
require 'text_comb'
|
123
|
+
littany = TextComb.string("I must not fear.")
|
124
|
+
littany.words.to_a # -> ["I", "must", "not", "fear"]
|
125
|
+
|
126
|
+
littany.guess_language.to_s # -> "English"
|
127
|
+
```
|
128
|
+
|
129
|
+
### Future Plans
|
130
|
+
|
131
|
+
- `TextComb.ngrams` currently yields whole strings - maybe split
|
132
|
+
them into Arrays of words.
|
data/Rakefile
ADDED
data/lib/text_comb.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'java'
|
2
|
+
require_relative '../vendor/cue.language.jar'
|
3
|
+
require_relative 'text_comb/string_extensions'
|
4
|
+
require_relative 'text_comb/string'
|
5
|
+
require_relative 'text_comb/iterator'
|
6
|
+
|
7
|
+
# Ruby-style entry points for the cue.language Java library: iterating over
# the words, sentences and n-grams of a string, and looking up or guessing
# stop-word lists.
module TextComb

  # Iterates over the words of +string+.
  #
  # Builds a cue.language WordIterator for +string+ and returns it wrapped
  # in an Iterator.
  def self.words(string)
    enumerate(cue.WordIterator.new(string))
  end

  # Iterates over the sentences of +string+.
  #
  # Builds a cue.language SentenceIterator for +string+ and returns it
  # wrapped in an Iterator.
  def self.sentences(string)
    enumerate(cue.SentenceIterator.new(string))
  end

  # Iterates over the +n+-word n-grams of +string+.
  #
  #   TextComb.ngrams(string, 3)
  #   TextComb.ngrams(string, 3, :locale => java.util.Locale.default)
  #   TextComb.ngrams(string, 3, :stop_words => :guess)
  #   TextComb.ngrams(string, 3, :stop_words => :English)
  #   TextComb.ngrams(string, 3, :stop_words => TextComb.guess_language(string))
  #
  # Options:
  # [:locale]     Locale handed to the Java NGramIterator; defaults to
  #               java.util.Locale.default.
  # [:stop_words] +:guess+ to guess from +string+, any other Symbol to look
  #               up a StopWords constant by name, a StopWords value to use
  #               as-is, or +nil+ (the default) to pass no stop-word list.
  #
  # Raises a RuntimeError when +:stop_words+ is none of the above.
  def self.ngrams(string, n, options={})

    locale = options[:locale] || java.util.Locale.default

    # :guess must be tested before the generic Symbol branch, because it is
    # itself a Symbol.
    stop_words_val = case options[:stop_words]
      when :guess
        guess_language(string)
      when Symbol
        stop_words(options[:stop_words])
      when stop.StopWords
        options[:stop_words]
      when nil
        nil
      else
        raise "Can't recognize the stop_words: #{options[:stop_words]}"
      end

    enumerate(cue.NGramIterator.new(n, string, locale, stop_words_val))
  end

  # Asks cue.language's StopWords.guess which stop-word list best matches
  # +string+.
  #
  #   TextComb.guess_language "How are you?"
  def self.guess_language(string)
    stop.StopWords.guess(string)
  end

  # Looks up a StopWords constant by name.
  #
  #   TextComb.stop_words :English
  #   TextComb.stop_words :French
  def self.stop_words(stopwords_symbol)
    stop.StopWords.const_get(stopwords_symbol)
  end


  # For convenience: wraps +s+ in a TextComb::String.
  def self.string(s)
    TextComb::String.new(s)
  end


  # The cue.language Java package holding the iterator classes.
  def self.cue
    Java::CueLang
  end

  # The cue.language Java package holding StopWords.
  def self.stop
    Java::CueLangStop
  end

  # Wraps a Java iterator in an Iterator.
  def self.enumerate(iterator)
    Iterator.new(iterator)
  end

  # A bare `private` does not apply to methods defined with `def self.`, so
  # the internal helpers are hidden explicitly.
  private_class_method :cue, :stop, :enumerate
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module TextComb

  # Mix-in that exposes the TextComb module functions as instance methods.
  # Every method hands the receiver's #to_s to the TextComb function of the
  # same name and returns whatever that function returns.
  module StringExtensions

    # Equivalent to TextComb.words for this object's text.
    def words
      text = to_s
      TextComb.words(text)
    end

    # Equivalent to TextComb.sentences for this object's text.
    def sentences
      text = to_s
      TextComb.sentences(text)
    end

    # Equivalent to TextComb.ngrams for this object's text; +n+ and
    # +options+ are passed through untouched.
    def ngrams(n, options={})
      text = to_s
      TextComb.ngrams(text, n, options)
    end

    # Equivalent to TextComb.guess_language for this object's text.
    def guess_language
      text = to_s
      TextComb.guess_language(text)
    end

  end

end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'text_comb'
|
3
|
+
|
4
|
+
# Checks the module-level TextComb functions (words, sentences, ngrams)
# against literal expected output.
class TestJavaInterface < MiniTest::Unit::TestCase

  # Multi-sentence sample text for the sentence-splitting test.
  LITTANY = "
I must not fear. Fear is the mind-killer. Fear is the little-death
that brings total obliteration. I will face my fear. I will permit
it to pass over me and through me. And when it has gone past I will
turn the inner eye to see its path. Where the fear has gone there
will be nothing. Only I will remain.".strip

  # Expects the words back without the trailing period or whitespace.
  def test_each_word
    found = TextComb.words("I must not fear. ").to_a

    assert_equal ["I", "must", "not", "fear"], found
  end

  # Expects one entry per sentence; every sentence but the last keeps a
  # trailing space.
  def test_each_sentence
    found = TextComb.sentences(LITTANY).to_a

    wanted = []
    wanted << "I must not fear. "
    wanted << "Fear is the mind-killer. "
    wanted << "Fear is the little-death that brings total obliteration. "
    wanted << "I will face my fear. "
    wanted << "I will permit it to pass over me and through me. "
    wanted << "And when it has gone past I will turn the inner eye to see its path. "
    wanted << "Where the fear has gone there will be nothing. "
    wanted << "Only I will remain."

    assert_equal wanted, found
  end

  # Expects overlapping two-word n-grams when no stop words are given.
  def test_each_ngram
    found = TextComb.ngrams("I must not fear. ", 2).to_a

    assert_equal ["I must", "must not", "not fear"], found
  end

  # Expects only the n-gram left once English stop words are applied.
  def test_each_ngram_with_stop_words
    sample = "Fear is the little-death that brings total obliteration."
    found = TextComb.ngrams(sample, 3, :stop_words => :English).to_a

    assert_equal ["brings total obliteration"], found
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'text_comb'
|
3
|
+
|
4
|
+
# Checks objects built by TextComb.string: ordinary String methods plus the
# words / sentences / ngrams / guess_language methods.
class TestTextCombString < MiniTest::Unit::TestCase

  # Ordinary String methods must give the same result as on the plain string.
  def test_can_call_normal_string_methods
    raw = "I came. I saw. I hacked."
    wrapped = TextComb.string(raw)

    assert_equal raw.upcase, wrapped.upcase
    assert_equal raw.reverse, wrapped.reverse
    assert_equal raw.gsub(/i/i, "We"), wrapped.gsub(/i/i, "We")
  end

  # Expects the words in order, with punctuation removed.
  def test_can_call_each_word
    wrapped = TextComb.string("I came. I saw. I hacked.")

    assert_equal ["I", "came", "I", "saw", "I", "hacked"], wrapped.words.to_a
  end

  # Expects every sentence but the last to keep its trailing space.
  def test_can_call_each_sentence
    wrapped = TextComb.string("I came. I saw. I hacked.")

    assert_equal ["I came. ", "I saw. ", "I hacked."], wrapped.sentences.to_a
  end

  # Expects overlapping three-word n-grams.
  def test_can_call_each_ngram
    wrapped = TextComb.string("Never wake a sleeping cat.")
    wanted = ["Never wake a", "wake a sleeping", "a sleeping cat"]

    assert_equal wanted, wrapped.ngrams(3).to_a
  end

  # Expects only these three n-grams once English stop words are applied.
  def test_ngrams_with_stop_words
    wrapped = TextComb.string("I saw red roosters at Willy's farm.")
    found = wrapped.ngrams(2, :stop_words => :English).to_a

    assert_equal ["saw red", "red roosters", "Willy's farm"], found
  end

  # The guess must equal the stop-word list looked up by name.
  def test_can_guess_its_language
    english = TextComb.string("I ate all the peanuts, then threw them up.")
    assert_equal TextComb.stop_words(:English), english.guess_language

    french = TextComb.string("J'ai la moutarde dans ma moustache.")
    assert_equal TextComb.stop_words(:French), french.guess_language
  end
end
|
data/text_comb.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative "lib/text_comb/version"
|
2
|
+
|
3
|
+
# Gem specification for text_comb, packaged for the 'java' platform.
spec = Gem::Specification.new do |gem|
  # Identity; the version comes from TextComb::VERSION.
  gem.name = 'text_comb'
  gem.version = TextComb::VERSION

  gem.summary = %{
    Extract words, sentences, and n-grams from natural-language text.
  }.strip

  gem.description = %{A Ruby wrapper for the cue.language java library.}

  gem.licenses = ['MIT']

  # Packaged files: all Ruby sources and tests, then the fixed top-level
  # files and the vendored cue.language jar.
  ruby_sources = Dir['lib/**/*.rb'] + Dir['test/**/*.rb']
  fixed_files = %w[
    LICENSE
    README.md
    Rakefile
    text_comb.gemspec
    vendor/cue.language.jar
  ]
  gem.files = ruby_sources + fixed_files

  gem.require_path = 'lib'
  gem.platform = 'java'
  gem.required_ruby_version = ">= 1.9.2"

  gem.author = "Dan Bernier"
  gem.email = "danbernier@gmail.com"
  gem.homepage = "https://github.com/danbernier/text_comb"
end
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_comb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: java
|
6
|
+
authors:
|
7
|
+
- Dan Bernier
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-09-14 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A Ruby wrapper for the cue.language java library.
|
14
|
+
email: danbernier@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- LICENSE
|
20
|
+
- README.md
|
21
|
+
- Rakefile
|
22
|
+
- lib/text_comb.rb
|
23
|
+
- lib/text_comb/iterator.rb
|
24
|
+
- lib/text_comb/string.rb
|
25
|
+
- lib/text_comb/string_extensions.rb
|
26
|
+
- lib/text_comb/version.rb
|
27
|
+
- test/test_java_interface.rb
|
28
|
+
- test/test_textcomb_string.rb
|
29
|
+
- text_comb.gemspec
|
30
|
+
- vendor/cue.language.jar
|
31
|
+
homepage: https://github.com/danbernier/text_comb
|
32
|
+
licenses:
|
33
|
+
- MIT
|
34
|
+
metadata: {}
|
35
|
+
post_install_message:
|
36
|
+
rdoc_options: []
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.9.2
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
requirements: []
|
50
|
+
rubyforge_project:
|
51
|
+
rubygems_version: 2.4.5
|
52
|
+
signing_key:
|
53
|
+
specification_version: 4
|
54
|
+
summary: Extract words, sentences, and n-grams from natural-language text.
|
55
|
+
test_files: []
|