poignant 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +13 -0
- data/README.md +33 -0
- data/Rakefile +6 -0
- data/lib/poignant/corpus.rb +35 -0
- data/lib/poignant/document.rb +18 -0
- data/lib/poignant/frequency_distribution.rb +23 -0
- data/lib/poignant/ngrams.rb +19 -0
- data/lib/poignant/simple_tokenizer.rb +8 -0
- data/lib/poignant/token_operations.rb +17 -0
- data/lib/poignant/tokenizer_interface.rb +11 -0
- data/lib/poignant/version.rb +3 -0
- data/lib/poignant/word_operations.rb +20 -0
- data/lib/poignant.rb +8 -0
- data/poignant.gemspec +22 -0
- data/spec/corpus_spec.rb +60 -0
- data/spec/document_spec.rb +55 -0
- data/spec/tokenizers_spec.rb +15 -0
- metadata +94 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0f005ec458887532e94fc32a9f3c130039567cb5
|
4
|
+
data.tar.gz: 890379dc4360900fcdee8bec8d9dec2c1549fe0d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2df2eff33931f6fefcbc18e183ddf346b2ae7bf9c7e0c5f5151023cd9e7cdda334427c033cb6fd34017fc4e3905683975dbf1d51b4c07006adf1a775ea7893e4
|
7
|
+
data.tar.gz: b11c5dcd3986163302bc51fc7c2989c82d9e6adff3e64ffc8945b9044be41755fff54a2a502d91c6eef066e68b342a16aa4dc4d3e1cf81f62be8438ea901d85e
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Copyright (c) 2013 Aaron Massey
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the 'License');
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an 'AS IS' BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
data/README.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Poignant
|
2
|
+
|
3
|
+
Poignant is a toolkit for natural language processing in Ruby. It is named in
|
4
|
+
honor of [why the lucky stiff][1].
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'poignant'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install poignant
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
|
22
|
+
TODO: Write usage instructions here
|
23
|
+
|
24
|
+
## Contributing
|
25
|
+
|
26
|
+
1. Fork it
|
27
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
28
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
29
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
30
|
+
5. Create a new Pull Request
|
31
|
+
|
32
|
+
|
33
|
+
[1]: http://en.wikipedia.org/wiki/Why_the_lucky_stiff
|
data/Rakefile
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require "poignant/frequency_distribution"
|
2
|
+
require "poignant/ngrams"
|
3
|
+
require "poignant/word_operations"
|
4
|
+
require "poignant/token_operations"
|
5
|
+
|
6
|
+
module Poignant
  # A collection of documents that is analysed as a single body of text.
  #
  # The combined text is exposed through #raw and is rebuilt every time the
  # collection changes through #initialize or #add. Each document only needs
  # to respond to +raw+ and return a String.
  class Corpus
    include Poignant::FrequencyDistribution
    include Poignant::NGrams
    include Poignant::WordOperations
    include Poignant::TokenOperations

    # Placed between documents when building #raw. Without it the last word
    # of one document and the first word of the next fuse into one word
    # ("...sentence." + "This..." => "sentence.This"). A plain space is used
    # because it is the one whitespace character that normalisation on
    # /[^a-z ]/ keeps.
    DOCUMENT_SEPARATOR = ' '.freeze

    attr_reader :collection
    # this is kind of a hack; we want to remove this eventually
    attr_reader :raw

    # document_array - Array of objects responding to +raw+.
    #
    # The array is copied so that #add never mutates the caller's array, and
    # so that later changes to the caller's array cannot leave #raw stale.
    def initialize(document_array)
      @collection = document_array.dup
      prepare_raw
    end

    # Appends +document+ to the corpus and rebuilds #raw.
    #
    # Returns the collection.
    def add(document)
      @collection << document
      prepare_raw
    end

    # Rebuilds #raw from the current documents, in collection order.
    #
    # Returns the collection, as callers of #add have always received.
    def prepare_raw
      @raw = @collection.map(&:raw).join(DOCUMENT_SEPARATOR)
      @collection
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require "poignant/word_operations"
|
2
|
+
require "poignant/frequency_distribution"
|
3
|
+
require "poignant/token_operations"
|
4
|
+
|
5
|
+
module Poignant
  # A single piece of text.
  #
  # The document only stores the string it was given; all analysis comes
  # from the mixed-in modules, which read the text through #raw / @raw.
  class Document
    include Poignant::WordOperations
    include Poignant::FrequencyDistribution
    include Poignant::TokenOperations

    # The unmodified text this document was created with.
    attr_reader :raw

    # string - the text of the document; stored as given, without copying.
    def initialize(string)
      @raw = string
    end
  end
end
|
18
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Poignant
  # Counting helpers for objects that carry text.
  #
  # The including class must provide +words+, +unique_words+ and
  # +word_count+ for the word-based methods, and +raw+ and +unique_tokens+
  # for the token-based distribution.
  module FrequencyDistribution

    # Builds a Hash mapping each distinct word (or token) to the number of
    # times it occurs, keyed in order of first appearance.
    #
    # tokens - when truthy, count the entries of +unique_tokens+ against
    #          +raw+; otherwise count the normalised +words+.
    #
    # Whole words are counted. The previous implementation used
    # String#scan with a String pattern, which counts substrings, so "is"
    # was also counted inside "this" and "a" inside every word containing
    # that letter.
    def frequency_distribution(tokens=false)
      if tokens
        poignant_token_counts
      else
        poignant_tally(words)
      end
    end

    # Ratio of total words to distinct words (1.0 means no word repeats).
    #
    # Returns 0.0 for text without any words instead of Float::NAN.
    def lexical_diversity
      total = word_count
      return 0.0 if total.zero?

      total.to_f / unique_words.count
    end

    private

    # Counts each entry of +unique_tokens+ as a whole whitespace-delimited
    # piece of +raw+. A token that never appears as such a piece (i.e. the
    # tokenizer did not split on whitespace) falls back to the historical
    # substring count, so no existing token loses its entry.
    def poignant_token_counts
      whole_pieces = poignant_tally(raw.split)
      unique_tokens.each_with_object({}) do |token, fd|
        fd[token] = whole_pieces.fetch(token) { raw.scan(token).count }
      end
    end

    # Plain occurrence count. A Hash without a default is returned on
    # purpose: looking up an unknown word keeps answering nil, as before.
    def poignant_tally(items)
      items.each_with_object({}) do |item, counts|
        counts[item] = counts.fetch(item, 0) + 1
      end
    end

  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Poignant
  # Contract every tokenizer is expected to fulfil.
  #
  # Both methods here only raise; a concrete tokenizer overrides them.
  # Note that NotImplementedError descends from ScriptError, not
  # StandardError, so a bare +rescue+ does not catch it.
  module TokenizerInterface
    # string - the text to split into tokens.
    #
    # Raises NotImplementedError unless overridden.
    def tokenize(string)
      raise(NotImplementedError, "Implementing 'tokenize' is required for this class.")
    end

    # string - the text to tokenize.
    #
    # Raises NotImplementedError unless overridden.
    def span_tokenize(string)
      raise(NotImplementedError, "Implementing 'span_tokenize' is required for this class.")
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Poignant
  # Word-level views of a text.
  #
  # The including class must set @raw to the String being analysed.
  module WordOperations
    # Returns the normalised text split on whitespace, in original order.
    def words
      normalized.split
    end

    # Returns each distinct word once, in order of first appearance.
    def unique_words
      words.uniq
    end

    # Returns the total number of words, repeats included.
    def word_count
      words.count
    end

    # Returns the text lower-cased and reduced to the letters a-z and spaces.
    #
    # Every whitespace character (newline, tab, ...) is turned into a space
    # before the filter runs. Previously those characters were deleted along
    # with the punctuation, which fused the words on either side
    # ("hello\nworld" became "helloworld").
    #
    # Digits, punctuation and letters outside a-z are still removed.
    def normalized
      @raw.downcase.gsub(/\s/, ' ').gsub(/[^a-z ]/, '')
    end
  end
end
|
20
|
+
|
data/lib/poignant.rb
ADDED
data/poignant.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for Poignant.
#
# lib/ is put on the load path first so the version constant can be read
# from the gem's own source.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'poignant/version'

Gem::Specification.new do |gem|
  gem.name          = "poignant"
  gem.version       = Poignant::VERSION
  gem.authors       = ["Aaron Massey"]
  gem.email         = ["akmassey@sixlines.org"]
  # The summary is the short one-liner; the description is the longer text.
  # The two strings were previously assigned the other way round.
  gem.summary       = %q{Poignant is a toolkit for natural language processing in Ruby.}
  gem.description   = %q{Poignant is a toolkit for natural language processing in Ruby. It is named in honor of why the lucky stiff.}
  # NOTE(review): homepage is still empty; fill in the project URL once it
  # is known.
  gem.homepage      = ""
  # LICENSE.txt is the Apache License, Version 2.0; declare it so the built
  # gem no longer ships with an empty license list.
  gem.license       = "Apache-2.0"

  # Package exactly what git tracks; executables and test files are derived
  # from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'rake'
end
|
data/spec/corpus_spec.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'poignant'
|
2
|
+
|
3
|
+
module Poignant

  # Examples for Corpus: construction from documents, adding a document,
  # and lexical diversity over the combined text.
  describe Corpus do
    before(:each) do
      @small = "This is a small sentence."
      @repeating = "This is a small sentence with repeated words. Words are repeated in this small sentence."
      @punctuation = "This is a small sentence. It includes? Punctuation!"

      @document_small = Document.new(@small)
      @document_repeating = Document.new(@repeating)
    end

    it "should be initialized with an array of documents" do
      documents = [Document.new(@small), Document.new(@repeating)]
      lambda { Corpus.new(documents) }.should_not raise_error
    end

    it "should be able to add a document to the collection" do
      corpus = Corpus.new([Document.new(@small)])
      extra = Document.new(@repeating)
      lambda { corpus.add(extra) }.should_not raise_error
    end

    it "should be able to calculate a lexical diversity" do
      single = Corpus.new([@document_small])
      single.lexical_diversity.should eq(1.0)

      # 15 words, 10 distinct: this words repeated small sentence occur twice
      repeated = Corpus.new([@document_repeating])
      repeated.lexical_diversity.should eq(15.0 / 10)
    end

    # The examples below are disabled.
    # NOTE(review): they pass a String to Corpus.new, while the active
    # examples above pass an Array of Documents; confirm the intended
    # constructor argument before enabling them.

    # it "should be able to calculate a frequency distribution" do
    #   corpus = Corpus.new(@repeating)
    #   corpus.frequency_distribution["repeated"].should eq 2
    #   corpus.frequency_distribution["small"].should eq 2
    #   corpus.frequency_distribution["this"].should eq 2
    #   corpus.frequency_distribution["are"].should eq 1
    # end

    # it "should be able to calculate a frequency distribution for tokens" do
    #   corpus = Corpus.new(@repeating)
    #   corpus.frequency_distribution(tokens=true)["repeated"].should eq 2
    #   corpus.frequency_distribution(tokens=true)["words."].should eq 1
    # end

    # it "should be able to remove punctuation" do
    #   Corpus.new(@punctuation).normalized.should eq "this is a small sentence it includes punctuation"
    # end

    # it "should be able to calculate ngrams" do
    #   bigrams = Corpus.new(@small).ngrams(2)
    #   bigrams.should_not be_nil
    #   bigrams[0].should eq ["this", "is"]
    #   bigrams[1].should eq ["is", "a"]
    #   bigrams[2].should eq ["a", "small"]
    # end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'poignant'
|
2
|
+
|
3
|
+
module Poignant

  # Examples for Document: construction, lexical diversity, unique tokens,
  # word and token frequency distributions, and normalisation.
  describe Document do
    before(:each) do
      @small = "This is a small sentence."
      @repeating = "This is a small sentence with repeated words. Words are repeated in this small sentence."
      @punctuation = "This is a small sentence. It includes? Punctuation!"
    end

    it "should be initialized with a string" do
      lambda { Document.new(@small) }.should_not raise_error
    end

    it "should be able to calculate a lexical diversity" do
      document = Document.new(@small)
      document.lexical_diversity.should eq(1.0)

      # 15 words, 10 distinct: this words repeated small sentence occur twice
      another = Document.new(@repeating)
      another.lexical_diversity.should eq(15.0 / 10)
    end

    it "should be able to calculate unique tokens" do
      document = Document.new(@repeating)
      document.unique_tokens.count.should eq(13)
    end

    it "should be able to calculate a frequency distribution" do
      distribution = Document.new(@repeating).frequency_distribution
      distribution["repeated"].should eq(2)
      distribution["small"].should eq(2)
      distribution["this"].should eq(2)
      distribution["are"].should eq(1)
    end

    it "should be able to calculate a frequency distribution for tokens" do
      # The flag is positional. The earlier `tokens=true` spelling looked
      # like a keyword argument but only assigned an unused local.
      distribution = Document.new(@repeating).frequency_distribution(true)
      distribution["repeated"].should eq(2)
      distribution["words."].should eq(1)
    end

    it "should be able to remove punctuation" do
      Document.new(@punctuation).normalized.should eq("this is a small sentence it includes punctuation")
    end

    # Disabled example, kept for reference.

    # it "should be able to calculate ngrams" do
    #   bigrams = Document.new(@small).ngrams(2)
    #   bigrams.should_not be_nil
    #   bigrams[0].should eq ["this", "is"]
    #   bigrams[1].should eq ["is", "a"]
    #   bigrams[2].should eq ["a", "small"]
    # end
  end
end
|
55
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'poignant'
|
2
|
+
|
3
|
+
module Poignant

  # Examples for SimpleTokenizer, called here at class level.
  describe SimpleTokenizer do

    it "should be able to tokenize based on spaces" do
      sentence = "This is a sentence."
      expected = ["This", "is", "a", "sentence."]

      # Punctuation stays attached to the word it follows.
      SimpleTokenizer.tokenize(sentence).should eq(expected)
    end

  end

end
|
metadata
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: poignant
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aaron Massey
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-04-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: Poignant is a toolkit for natural language processing in Ruby.
|
42
|
+
email:
|
43
|
+
- akmassey@sixlines.org
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- .gitignore
|
49
|
+
- Gemfile
|
50
|
+
- LICENSE.txt
|
51
|
+
- README.md
|
52
|
+
- Rakefile
|
53
|
+
- lib/poignant.rb
|
54
|
+
- lib/poignant/corpus.rb
|
55
|
+
- lib/poignant/document.rb
|
56
|
+
- lib/poignant/frequency_distribution.rb
|
57
|
+
- lib/poignant/ngrams.rb
|
58
|
+
- lib/poignant/simple_tokenizer.rb
|
59
|
+
- lib/poignant/token_operations.rb
|
60
|
+
- lib/poignant/tokenizer_interface.rb
|
61
|
+
- lib/poignant/version.rb
|
62
|
+
- lib/poignant/word_operations.rb
|
63
|
+
- poignant.gemspec
|
64
|
+
- spec/corpus_spec.rb
|
65
|
+
- spec/document_spec.rb
|
66
|
+
- spec/tokenizers_spec.rb
|
67
|
+
homepage: ''
|
68
|
+
licenses: []
|
69
|
+
metadata: {}
|
70
|
+
post_install_message:
|
71
|
+
rdoc_options: []
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
requirements:
|
76
|
+
- - '>='
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: '0'
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
requirements: []
|
85
|
+
rubyforge_project:
|
86
|
+
rubygems_version: 2.0.3
|
87
|
+
signing_key:
|
88
|
+
specification_version: 4
|
89
|
+
summary: Poignant is a toolkit for natural language processing in Ruby. It is named
|
90
|
+
in honor of why the lucky stiff.
|
91
|
+
test_files:
|
92
|
+
- spec/corpus_spec.rb
|
93
|
+
- spec/document_spec.rb
|
94
|
+
- spec/tokenizers_spec.rb
|