text_comb 0.0.2-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e4df656cd44bb7a284b5e98201fb6085e5e91cc0
4
+ data.tar.gz: 858677d69dfaa6d2c4f2f386e44a41dc7f1f0cbd
5
+ SHA512:
6
+ metadata.gz: 83d309b17b501242535177dc601317daca51dc972d65c07d721d5c341b2fe9e60852286a4370d337a82ba4dabbb4a5b8647e4acebdddaeda813cdb71d74e947a
7
+ data.tar.gz: a13e54b3c74d2ba01f3c1b806b4b7ca1ef5e8915fa3cb00ab36820d8a37d1011eb46ba762956297a39c974e09394c4b36b0fed86deb63c364db909583e17b1b7
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Dan Bernier
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,132 @@
1
+ # Integration Exercise: Java Library Wrapper
2
+
3
+ ## Exercise Summary
4
+
5
+ - You should create a gem using JRuby that wraps an existing Java library.
6
+ - Your gem should work with a Java library that doesn't already have
7
+ a good wrapper.
8
+ - You should make the API for your library look and feel like Ruby, not Java.
9
+
10
+ ## Wrapping cue.language
11
+
12
+ For this assignment, I'm wrapping the
13
+ [cue.language](https://github.com/vcl/cue.language) library, which
14
+ handles "counting strings, identifying languages, and removing stop
15
+ words."
16
+
17
+ ## How to Use It
18
+
19
+ ### It's All In the TextComb Module
20
+
21
+ There are methods for splitting up text:
22
+
23
+ ```ruby
24
+ require 'text_comb'
25
+
26
+ string = "He must be a little nuts. He is. I mean he just isn't well
27
+ screwed on is he?"
28
+
29
+ p TextComb.words(string).to_a
30
+
31
+ # prints:
32
+ ["He", "must", "be", "a", "little", "nuts", "He", "is", "I", "mean"...
33
+
34
+ TextComb.sentences(string).each do |sentence|
35
+ p sentence
36
+ end
37
+
38
+ # prints:
39
+ "He must be a little nuts. "
40
+ "He is. "
41
+ "I mean he just isn't well screwed on is he?"
42
+
43
+ TextComb.ngrams(string, 5).each do |ngram|
44
+ p ngram
45
+ end
46
+
47
+ # prints:
48
+ "He must be a little"
49
+ "must be a little nuts"
50
+ "I mean he just isn't"
51
+ ...
52
+ ```
53
+
54
+ TextComb can take a guess at the language, and use the appropriate stop-words:
55
+
56
+ ```ruby
57
+ string = "That's a great mustache, grandma!"
58
+ TextComb.ngrams(string, 2, :stop_words=>:guess).to_a
59
+
60
+ # returns:
61
+ ["great mustache", "mustache grandma"]
62
+ ```
63
+
64
+ If it picks wrong, and you know what you're dealing with, you can specify:
65
+
66
+ ```ruby
67
+ TextComb.ngrams(string, 3, :stop_words=>:Croatian).each do |ngram|
68
+ puts ngram
69
+ end
70
+ ```
71
+
72
+ If you're curious, TextComb will tell you which language it guessed:
73
+
74
+ ```ruby
75
+ string = "J'ai la moutarde dans ma moustache."
76
+ TextComb.guess_language(string).to_s # "French"
77
+ ```
78
+
79
+ ### Mix-in TextComb::StringExtensions
80
+
81
+ Extend a string with TextComb::StringExtensions, and it'll have those
82
+ methods:
83
+
84
+ ```ruby
85
+ require 'text_comb'
86
+
87
+ motto = "I came. I saw. I hacked."
88
+ motto.extend(TextComb::StringExtensions)
89
+
90
+ motto.sentences.to_a # ["I came. ", "I saw. ", "I hacked."]
91
+ motto.words.to_a.uniq # ["I", "came", "saw", "hacked"]
92
+ ```
93
+
94
+ Stop-words for n-grams work the same way, too.
95
+
96
+ ```ruby
97
+ string = "I saw red roosters at Ted's farm."
98
+ string.extend(TextComb::StringExtensions)
99
+ string.ngrams(2, :stop_words => :English).to_a
100
+
101
+ # returns:
102
+ ["saw red", "red roosters", "Ted's farm"]
103
+ ```
104
+
105
+ ### Make a TextComb::String
106
+
107
+ TextComb::String includes TextComb::StringExtensions, but delegates everything
108
+ else to its string. It's like mixing TextComb::StringExtensions into your
109
+ own string.
110
+
111
+ ```ruby
112
+ require 'text_comb'
113
+
114
+ littany = TextComb::String.new("I must not fear.")
115
+ littany.ngrams(3).to_a # ["I must not", "must not fear"]
116
+ ```
117
+
118
+ Even handier, there's the TextComb.string method to save you some
119
+ finger-tapping.
120
+
121
+ ```ruby
122
+ require 'text_comb'
123
+ littany = TextComb.string("I must not fear.")
124
+ littany.words.to_a # -> ["I", "must", "not", "fear"]
125
+
126
+ littany.guess_language.to_s # -> "English"
127
+ ```
128
+
129
+ ### Future Plans
130
+
131
+ - `TextComb.ngrams` currently yields whole strings - maybe split
132
+ them into Arrays of words.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ require 'rake/testtask'
2
+
# Register rake's stock test task with Rake::TestTask's default settings.
Rake::TestTask.new

# Running plain `rake` runs the :test task (presumably the one defined above).
task default: :test
data/lib/text_comb.rb ADDED
@@ -0,0 +1,72 @@
1
+ require 'java'
2
+ require_relative '../vendor/cue.language.jar'
3
+ require_relative 'text_comb/string_extensions'
4
+ require_relative 'text_comb/string'
5
+ require_relative 'text_comb/iterator'
6
+
# Ruby-style front end for the cue.language Java library.
#
# words, sentences and ngrams each build a cue.language iterator and return
# it wrapped in a TextComb::Iterator; guess_language and stop_words return
# values obtained from cue.language's StopWords.
module TextComb

  # Wraps a cue.language WordIterator over +string+.
  #
  #   TextComb.words("I must not fear. ").to_a
  def self.words(string)
    enumerate(cue.WordIterator.new(string))
  end

  # Wraps a cue.language SentenceIterator over +string+.
  #
  #   TextComb.sentences("I came. I saw.").to_a
  def self.sentences(string)
    enumerate(cue.SentenceIterator.new(string))
  end

  # Wraps a cue.language NGramIterator producing +n+-word n-grams of +string+.
  #
  #   TextComb.ngrams(string, 3)
  #   TextComb.ngrams(string, 3, :locale => java.util.Locale.default)
  #   TextComb.ngrams(string, 3, :stop_words => :guess)
  #   TextComb.ngrams(string, 3, :stop_words => :English)
  #   TextComb.ngrams(string, 3, :stop_words => TextComb.guess_language(string))
  #
  # Options:
  # [:locale]     handed to NGramIterator; defaults to java.util.Locale.default.
  # [:stop_words] +nil+ (the default), +:guess+ to use guess_language(string),
  #               any other Symbol to look it up with stop_words, or a
  #               StopWords value to use as given.
  #
  # Raises RuntimeError when :stop_words is anything else.
  def self.ngrams(string, n, options={})
    locale = options[:locale] || java.util.Locale.default

    # :guess must be tested before the generic Symbol branch.
    stop_words_val =
      case options[:stop_words]
      when :guess
        guess_language(string)
      when Symbol
        stop_words(options[:stop_words])
      when stop.StopWords
        options[:stop_words]
      when nil
        nil
      else
        raise "Can't recognize the stop_words: #{options[:stop_words]}"
      end

    enumerate(cue.NGramIterator.new(n, string, locale, stop_words_val))
  end

  # Asks cue.language's StopWords to guess the language of +string+.
  #
  #   TextComb.guess_language "How are you?"
  def self.guess_language(string)
    stop.StopWords.guess(string)
  end

  # Looks up a StopWords constant by name.
  #
  #   TextComb.stop_words :English
  #   TextComb.stop_words :French
  def self.stop_words(stopwords_symbol)
    stop.StopWords.const_get(stopwords_symbol)
  end

  # For convenience: builds a TextComb::String around +s+.
  def self.string(s)
    TextComb::String.new(s)
  end

  # Java package holding the cue.language iterators.
  def self.cue
    Java::CueLang
  end

  # Java package holding the cue.language StopWords.
  def self.stop
    Java::CueLangStop
  end

  # Wraps a Java iterator so it can be used as a Ruby Enumerable.
  def self.enumerate(iterator)
    Iterator.new(iterator)
  end

  # A bare `private` does not apply to methods defined with `def self.`,
  # so the internal helpers have to be hidden explicitly.
  private_class_method :cue, :stop, :enumerate
end
@@ -0,0 +1,18 @@
1
+ module TextComb
2
+
3
+ class Iterator
4
+ include Enumerable
5
+
6
+ def initialize(java_iter)
7
+ @java_iter = java_iter
8
+ end
9
+
10
+ def each
11
+ while @java_iter.has_next
12
+ yield @java_iter.next
13
+ end
14
+ end
15
+
16
+ end
17
+
18
+ end
@@ -0,0 +1,9 @@
1
+ require 'delegate'
2
+
module TextComb

  # A wrapper around a plain ::String: DelegateClass forwards ordinary String
  # methods to the wrapped object, and StringExtensions is mixed in on top.
  String = Class.new(DelegateClass(::String)) do
    include StringExtensions
  end

end
@@ -0,0 +1,23 @@
module TextComb

  # Instance-level shortcuts for the TextComb module functions. Each method
  # converts the receiver with +to_s+ and forwards to the TextComb function
  # of the same name.
  module StringExtensions

    # Same as TextComb.words(text).
    def words
      text = to_s
      TextComb.words(text)
    end

    # Same as TextComb.sentences(text).
    def sentences
      text = to_s
      TextComb.sentences(text)
    end

    # Same as TextComb.ngrams(text, n, options).
    def ngrams(n, options={})
      text = to_s
      TextComb.ngrams(text, n, options)
    end

    # Same as TextComb.guess_language(text).
    def guess_language
      text = to_s
      TextComb.guess_language(text)
    end

  end

end
@@ -0,0 +1,3 @@
module TextComb
  # Release number of the text_comb gem.
  VERSION = '0.0.2'
end
@@ -0,0 +1,48 @@
1
+ require 'minitest/autorun'
2
+ require 'text_comb'
3
+
# Tests for the module-level TextComb API (words, sentences, ngrams).
#
# The base class is picked at load time: Minitest 5 names it Minitest::Test,
# while older releases only provide MiniTest::Unit::TestCase.
class TestJavaInterface < (defined?(Minitest::Test) ? Minitest::Test : MiniTest::Unit::TestCase)

  LITTANY = "
I must not fear. Fear is the mind-killer. Fear is the little-death
that brings total obliteration. I will face my fear. I will permit
it to pass over me and through me. And when it has gone past I will
turn the inner eye to see its path. Where the fear has gone there
will be nothing. Only I will remain.".strip

  # Words come back without the trailing punctuation.
  def test_each_word
    expected = %w[I must not fear]

    assert_equal expected, TextComb.words("I must not fear. ").to_a
  end

  # Each sentence keeps its trailing space, except the last one.
  def test_each_sentence

    expected = [
      "I must not fear. ",
      "Fear is the mind-killer. ",
      "Fear is the little-death that brings total obliteration. ",
      "I will face my fear. ",
      "I will permit it to pass over me and through me. ",
      "And when it has gone past I will turn the inner eye to see its path. ",
      "Where the fear has gone there will be nothing. ",
      "Only I will remain."
    ]

    assert_equal expected, TextComb.sentences(LITTANY).to_a
  end

  # Without stop words, every run of two adjacent words is reported.
  def test_each_ngram
    expected = ["I must", "must not", "not fear"]

    assert_equal expected, TextComb.ngrams("I must not fear. ", 2).to_a
  end

  # With English stop words, only one three-word n-gram is expected.
  def test_each_ngram_with_stop_words
    text = "Fear is the little-death that brings total obliteration."
    expected = ["brings total obliteration"]
    ngrams = TextComb.ngrams(text, 3, :stop_words => :English).to_a

    assert_equal expected, ngrams
  end
end
@@ -0,0 +1,51 @@
1
+ require 'minitest/autorun'
2
+ require 'text_comb'
3
+
# Tests for TextComb::String, built through the TextComb.string shortcut.
#
# The base class is picked at load time: Minitest 5 names it Minitest::Test,
# while older releases only provide MiniTest::Unit::TestCase.
class TestTextCombString < (defined?(Minitest::Test) ? Minitest::Test : MiniTest::Unit::TestCase)

  # Ordinary String methods must be forwarded to the wrapped string.
  def test_can_call_normal_string_methods
    plain_string = "I came. I saw. I hacked."
    textcomb = TextComb.string(plain_string)

    assert_equal plain_string.upcase, textcomb.upcase
    assert_equal plain_string.reverse, textcomb.reverse
    assert_equal plain_string.gsub(/i/i, "We"), textcomb.gsub(/i/i, "We")
  end

  def test_can_call_each_word
    textcomb = TextComb.string("I came. I saw. I hacked.")
    expected = %w[I came I saw I hacked]

    assert_equal expected, textcomb.words.to_a
  end

  def test_can_call_each_sentence
    textcomb = TextComb.string("I came. I saw. I hacked.")
    expected = ["I came. ", "I saw. ", "I hacked."]

    assert_equal expected, textcomb.sentences.to_a
  end

  def test_can_call_each_ngram
    textcomb = TextComb.string("Never wake a sleeping cat.")
    expected = ["Never wake a", "wake a sleeping", "a sleeping cat"]

    assert_equal expected, textcomb.ngrams(3).to_a
  end

  def test_ngrams_with_stop_words
    textcomb = TextComb.string("I saw red roosters at Willy's farm.")
    expected = ["saw red", "red roosters", "Willy's farm"]

    ngrams = textcomb.ngrams(2, :stop_words => :English).to_a
    assert_equal expected, ngrams
  end

  # guess_language must return the same value stop_words gives for that language.
  def test_can_guess_its_language
    textcomb = TextComb.string("I ate all the peanuts, then threw them up.")
    assert_equal TextComb.stop_words(:English), textcomb.guess_language

    textcomb = TextComb.string("J'ai la moutarde dans ma moustache.")
    assert_equal TextComb.stop_words(:French), textcomb.guess_language
  end
end
data/text_comb.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ require_relative "lib/text_comb/version"
2
+
# Gem specification for text_comb. The version comes from TextComb::VERSION.
spec = Gem::Specification.new do |gem|
  # Identity
  gem.name    = 'text_comb'
  gem.version = TextComb::VERSION

  gem.summary     = "Extract words, sentences, and n-grams from natural-language text."
  gem.description = "A Ruby wrapper for the cue.language java library."
  gem.licenses    = ['MIT']

  # Authorship
  gem.author   = "Dan Bernier"
  gem.email    = "danbernier@gmail.com"
  gem.homepage = "https://github.com/danbernier/text_comb"

  # Packaged files: every Ruby source and test, then the fixed extras.
  ruby_sources = Dir['lib/**/*.rb'] + Dir['test/**/*.rb']
  fixed_extras = [
    'LICENSE',
    'README.md',
    'Rakefile',
    'text_comb.gemspec',
    'vendor/cue.language.jar'
  ]
  gem.files = ruby_sources + fixed_extras

  # Runtime requirements
  gem.require_path          = 'lib'
  gem.platform              = 'java'
  gem.required_ruby_version = ">= 1.9.2"
end
Binary file
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_comb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: java
6
+ authors:
7
+ - Dan Bernier
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-09-14 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A Ruby wrapper for the cue.language java library.
14
+ email: danbernier@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - LICENSE
20
+ - README.md
21
+ - Rakefile
22
+ - lib/text_comb.rb
23
+ - lib/text_comb/iterator.rb
24
+ - lib/text_comb/string.rb
25
+ - lib/text_comb/string_extensions.rb
26
+ - lib/text_comb/version.rb
27
+ - test/test_java_interface.rb
28
+ - test/test_textcomb_string.rb
29
+ - text_comb.gemspec
30
+ - vendor/cue.language.jar
31
+ homepage: https://github.com/danbernier/text_comb
32
+ licenses:
33
+ - MIT
34
+ metadata: {}
35
+ post_install_message:
36
+ rdoc_options: []
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - '>='
42
+ - !ruby/object:Gem::Version
43
+ version: 1.9.2
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project:
51
+ rubygems_version: 2.4.5
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: Extract words, sentences, and n-grams from natural-language text.
55
+ test_files: []