text_comb 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e4df656cd44bb7a284b5e98201fb6085e5e91cc0
4
+ data.tar.gz: 858677d69dfaa6d2c4f2f386e44a41dc7f1f0cbd
5
+ SHA512:
6
+ metadata.gz: 83d309b17b501242535177dc601317daca51dc972d65c07d721d5c341b2fe9e60852286a4370d337a82ba4dabbb4a5b8647e4acebdddaeda813cdb71d74e947a
7
+ data.tar.gz: a13e54b3c74d2ba01f3c1b806b4b7ca1ef5e8915fa3cb00ab36820d8a37d1011eb46ba762956297a39c974e09394c4b36b0fed86deb63c364db909583e17b1b7
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Dan Bernier
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,132 @@
1
+ # Integration Exercise: Java Library Wrapper
2
+
3
+ ## Exercise Summary
4
+
5
+ - You should create a gem using JRuby that wraps an existing Java library.
6
+ - Your gem should work with a Java library that doesn't already have
7
+ a good wrapper.
8
+ - You should make the API for your library look and feel like Ruby, not Java.
9
+
10
+ ## Wrapping cue.language
11
+
12
+ For this assignment, I'm wrapping the
13
+ [cue.language](https://github.com/vcl/cue.language) library, which
14
+ handles "counting strings, identifying languages, and removing stop
15
+ words."
16
+
17
+ ## How to Use It
18
+
19
+ ### It's All In the TextComb Module
20
+
21
+ There are methods for splitting up text:
22
+
23
+ ```ruby
24
+ require 'text_comb'
25
+
26
+ string = "He must be a little nuts. He is. I mean he just isn't well
27
+ screwed on is he?"
28
+
29
+ p TextComb.words(string).to_a
30
+
31
+ # prints:
32
+ ["He", "must", "be", "a", "little", "nuts", "He", "is", "I", "mean"...
33
+
34
+ TextComb.sentences(string).each do |sentence|
35
+ p sentence
36
+ end
37
+
38
+ # prints:
39
+ "He must be a little nuts. "
40
+ "He is. "
41
+ "I mean he just isn't well screwed on is he?"
42
+
43
+ TextComb.ngrams(string, 5).each do |ngram|
44
+ p ngram
45
+ end
46
+
47
+ # prints:
48
+ "He must be a little"
49
+ "must be a little nuts"
50
+ "I mean he just isn't"
51
+ ...
52
+ ```
53
+
54
+ TextComb can take a guess at the language, and use the appropriate stop-words:
55
+
56
+ ```ruby
57
+ string = "That's a great mustache, grandma!"
58
+ TextComb.ngrams(string, 2, :stop_words=>:guess).to_a
59
+
60
+ # returns:
61
+ ["great mustache", "mustache grandma"]
62
+ ```
63
+
64
+ If it picks wrong, and you know what you're dealing with, you can specify:
65
+
66
+ ```ruby
67
+ TextComb.ngrams(string, 3, :stop_words=>:Croatian).each do |ngram|
68
+ puts ngram
69
+ end
70
+ ```
71
+
72
+ If you're curious, TextComb will tell you what it guessed:
73
+
74
+ ```ruby
75
+ string = "J'ai la moutarde dans ma moustache."
76
+ TextComb.guess_language(string).to_s # "French"
77
+ ```
78
+
79
+ ### Mix-in TextComb::StringExtensions
80
+
81
+ Extend a string with TextComb::StringExtensions, and it'll have those
82
+ methods:
83
+
84
+ ```ruby
85
+ require 'text_comb'
86
+
87
+ motto = "I came. I saw. I hacked."
88
+ motto.extend(TextComb::StringExtensions)
89
+
90
+ motto.sentences.to_a # ["I came. ", "I saw. ", "I hacked."]
91
+ motto.words.to_a.uniq # ["I", "came", "saw", "hacked"]
92
+ ```
93
+
94
+ Stop-words for n-grams work the same way, too.
95
+
96
+ ```ruby
97
+ string = "I saw red roosters at Ted's farm."
98
+ string.extend(TextComb::StringExtensions)
99
+ string.ngrams(2, :stop_words => :English).to_a
100
+
101
+ # returns:
102
+ ["saw red", "red roosters", "Ted's farm"]
103
+ ```
104
+
105
+ ### Make a TextComb::String
106
+
107
+ TextComb::String includes TextComb::StringExtensions, but delegates everything
108
+ else to its string. It's like mixing TextComb::StringExtensions into your
109
+ own string.
110
+
111
+ ```ruby
112
+ require 'text_comb'
113
+
114
+ littany = TextComb::String.new("I must not fear.")
115
+ littany.ngrams(3).to_a # ["I must not", "must not fear"]
116
+ ```
117
+
118
+ Even handier, there's the TextComb.string method to save you some
119
+ finger-tapping.
120
+
121
+ ```ruby
122
+ require 'text_comb'
123
+ littany = TextComb.string("I must not fear.")
124
+ littany.words.to_a # -> ["I", "must", "not", "fear"]
125
+
126
+ littany.guess_language.to_s # -> "English"
127
+ ```
128
+
129
+ ### Future Plans
130
+
131
+ - `TextComb.ngrams` currently yields whole strings - maybe split
132
+ them into Arrays of words.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ require 'rake/testtask'
2
+
3
+ Rake::TestTask.new
4
+ task :default => :test
data/lib/text_comb.rb ADDED
@@ -0,0 +1,72 @@
1
+ require 'java'
2
+ require_relative '../vendor/cue.language.jar'
3
+ require_relative 'text_comb/string_extensions'
4
+ require_relative 'text_comb/string'
5
+ require_relative 'text_comb/iterator'
6
+
7
+ module TextComb
8
+
9
+ def self.words(string)
10
+ enumerate(cue.WordIterator.new(string))
11
+ end
12
+
13
+ def self.sentences(string)
14
+ enumerate(cue.SentenceIterator.new(string))
15
+ end
16
+
17
+ # TextComb.ngrams(string, 3)
18
+ # TextComb.ngrams(string, 3, :locale => java.util.Locale.default)
19
+ # TextComb.ngrams(string, 3, :stop_words => :guess)
20
+ # TextComb.ngrams(string, 3, :stop_words => :English)
21
+ # TextComb.ngrams(string, 3, :stop_words => TextComb.guess_language(string))
22
+ def self.ngrams(string, n, options={})
23
+
24
+ locale = options[:locale] || java.util.Locale.default
25
+
26
+ stop_words_val = case options[:stop_words]
27
+ when :guess
28
+ guess_language(string)
29
+ when Symbol
30
+ stop_words(options[:stop_words])
31
+ when stop.StopWords
32
+ options[:stop_words]
33
+ when nil
34
+ nil
35
+ else
36
+ raise "Can't recognize the stop_words: #{options[:stop_words]}"
37
+ end
38
+
39
+ enumerate(cue.NGramIterator.new(n, string, locale, stop_words_val))
40
+ end
41
+
42
+ # TextComb.guess_language "How are you?"
43
+ def self.guess_language(string)
44
+ stop.StopWords.guess(string)
45
+ end
46
+
47
+ # TextComb.stop_words :English
48
+ # TextComb.stop_words :French
49
+ def self.stop_words(stopwords_symbol)
50
+ stop.StopWords.const_get(stopwords_symbol)
51
+ end
52
+
53
+
54
+ # For convenience
55
+ def self.string(s)
56
+ TextComb::String.new(s)
57
+ end
58
+
59
+
60
+ private
61
+ def self.cue
62
+ Java::CueLang
63
+ end
64
+
65
+ def self.stop
66
+ Java::CueLangStop
67
+ end
68
+
69
+ def self.enumerate(iterator)
70
+ Iterator.new(iterator)
71
+ end
72
+ end
@@ -0,0 +1,18 @@
1
+ module TextComb
2
+
3
+ class Iterator
4
+ include Enumerable
5
+
6
+ def initialize(java_iter)
7
+ @java_iter = java_iter
8
+ end
9
+
10
+ def each
11
+ while @java_iter.has_next
12
+ yield @java_iter.next
13
+ end
14
+ end
15
+
16
+ end
17
+
18
+ end
@@ -0,0 +1,9 @@
1
+ require 'delegate'
2
+
3
+ module TextComb
4
+
5
+ class String < DelegateClass(::String)
6
+ include StringExtensions
7
+ end
8
+
9
+ end
@@ -0,0 +1,23 @@
1
+ module TextComb
2
+
3
+ module StringExtensions
4
+
5
+ def words
6
+ TextComb.words(self.to_s)
7
+ end
8
+
9
+ def sentences
10
+ TextComb.sentences(self.to_s)
11
+ end
12
+
13
+ def ngrams(n, options={})
14
+ TextComb.ngrams(self.to_s, n, options)
15
+ end
16
+
17
+ def guess_language
18
+ TextComb.guess_language(self.to_s)
19
+ end
20
+
21
+ end
22
+
23
+ end
@@ -0,0 +1,3 @@
1
+ module TextComb
2
+ VERSION = "0.0.2"
3
+ end
@@ -0,0 +1,48 @@
1
+ require 'minitest/autorun'
2
+ require 'text_comb'
3
+
4
+ class TestJavaInterface < MiniTest::Unit::TestCase
5
+
6
+ LITTANY = "
7
+ I must not fear. Fear is the mind-killer. Fear is the little-death
8
+ that brings total obliteration. I will face my fear. I will permit
9
+ it to pass over me and through me. And when it has gone past I will
10
+ turn the inner eye to see its path. Where the fear has gone there
11
+ will be nothing. Only I will remain.".strip
12
+
13
+ def test_each_word
14
+ expected = %w[I must not fear]
15
+
16
+ assert_equal expected, TextComb.words("I must not fear. ").to_a
17
+ end
18
+
19
+ def test_each_sentence
20
+
21
+ expected = [
22
+ "I must not fear. ",
23
+ "Fear is the mind-killer. ",
24
+ "Fear is the little-death that brings total obliteration. ",
25
+ "I will face my fear. ",
26
+ "I will permit it to pass over me and through me. ",
27
+ "And when it has gone past I will turn the inner eye to see its path. ",
28
+ "Where the fear has gone there will be nothing. ",
29
+ "Only I will remain."
30
+ ]
31
+
32
+ assert_equal expected, TextComb.sentences(LITTANY).to_a
33
+ end
34
+
35
+ def test_each_ngram
36
+ expected = ["I must", "must not", "not fear"]
37
+
38
+ assert_equal expected, TextComb.ngrams("I must not fear. ", 2).to_a
39
+ end
40
+
41
+ def test_each_ngram_with_stop_words
42
+ text = "Fear is the little-death that brings total obliteration."
43
+ expected = ["brings total obliteration"]
44
+ ngrams = TextComb.ngrams(text, 3, :stop_words => :English).to_a
45
+
46
+ assert_equal expected, ngrams
47
+ end
48
+ end
@@ -0,0 +1,51 @@
1
+ require 'minitest/autorun'
2
+ require 'text_comb'
3
+
4
+ class TestTextCombString < MiniTest::Unit::TestCase
5
+
6
+ def test_can_call_normal_string_methods
7
+ plain_string = "I came. I saw. I hacked."
8
+ textcomb = TextComb.string(plain_string)
9
+
10
+ assert_equal plain_string.upcase, textcomb.upcase
11
+ assert_equal plain_string.reverse, textcomb.reverse
12
+ assert_equal plain_string.gsub(/i/i, "We"), textcomb.gsub(/i/i, "We")
13
+ end
14
+
15
+ def test_can_call_each_word
16
+ textcomb = TextComb.string("I came. I saw. I hacked.")
17
+ expected = %w[I came I saw I hacked]
18
+
19
+ assert_equal expected, textcomb.words.to_a
20
+ end
21
+
22
+ def test_can_call_each_sentence
23
+ textcomb = TextComb.string("I came. I saw. I hacked.")
24
+ expected = ["I came. ", "I saw. ", "I hacked."]
25
+
26
+ assert_equal expected, textcomb.sentences.to_a
27
+ end
28
+
29
+ def test_can_call_each_ngram
30
+ textcomb = TextComb.string("Never wake a sleeping cat.")
31
+ expected = ["Never wake a", "wake a sleeping", "a sleeping cat"]
32
+
33
+ assert_equal expected, textcomb.ngrams(3).to_a
34
+ end
35
+
36
+ def test_ngrams_with_stop_words
37
+ textcomb = TextComb.string("I saw red roosters at Willy's farm.")
38
+ expected = ["saw red", "red roosters", "Willy's farm"]
39
+
40
+ ngrams = textcomb.ngrams(2, :stop_words => :English).to_a
41
+ assert_equal expected, ngrams
42
+ end
43
+
44
+ def test_can_guess_its_language
45
+ textcomb = TextComb.string("I ate all the peanuts, then threw them up.")
46
+ assert_equal TextComb.stop_words(:English), textcomb.guess_language
47
+
48
+ textcomb = TextComb.string("J'ai la moutarde dans ma moustache.")
49
+ assert_equal TextComb.stop_words(:French), textcomb.guess_language
50
+ end
51
+ end
data/text_comb.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ require_relative "lib/text_comb/version"
2
+
3
+ spec = Gem::Specification.new do |s|
4
+ s.name = 'text_comb'
5
+ s.version = TextComb::VERSION
6
+
7
+ s.summary = %{
8
+ Extract words, sentences, and n-grams from natural-language text.
9
+ }.strip
10
+
11
+ s.description = %{A Ruby wrapper for the cue.language java library.}
12
+
13
+ s.licenses = ['MIT']
14
+
15
+ s.files = Dir['lib/**/*.rb'] + Dir['test/**/*.rb'] + %w[
16
+ LICENSE
17
+ README.md
18
+ Rakefile
19
+ text_comb.gemspec
20
+ vendor/cue.language.jar
21
+ ]
22
+
23
+ s.require_path = 'lib'
24
+ s.platform = 'java'
25
+ s.required_ruby_version = ">= 1.9.2"
26
+
27
+ s.author = "Dan Bernier"
28
+ s.email = "danbernier@gmail.com"
29
+ s.homepage = "https://github.com/danbernier/text_comb"
30
+ end
Binary file
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_comb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: java
6
+ authors:
7
+ - Dan Bernier
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-09-14 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A Ruby wrapper for the cue.language java library.
14
+ email: danbernier@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - LICENSE
20
+ - README.md
21
+ - Rakefile
22
+ - lib/text_comb.rb
23
+ - lib/text_comb/iterator.rb
24
+ - lib/text_comb/string.rb
25
+ - lib/text_comb/string_extensions.rb
26
+ - lib/text_comb/version.rb
27
+ - test/test_java_interface.rb
28
+ - test/test_textcomb_string.rb
29
+ - text_comb.gemspec
30
+ - vendor/cue.language.jar
31
+ homepage: https://github.com/danbernier/text_comb
32
+ licenses:
33
+ - MIT
34
+ metadata: {}
35
+ post_install_message:
36
+ rdoc_options: []
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - '>='
42
+ - !ruby/object:Gem::Version
43
+ version: 1.9.2
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project:
51
+ rubygems_version: 2.4.5
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: Extract words, sentences, and n-grams from natural-language text.
55
+ test_files: []