poignant 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +13 -0
- data/README.md +33 -0
- data/Rakefile +6 -0
- data/lib/poignant/corpus.rb +35 -0
- data/lib/poignant/document.rb +18 -0
- data/lib/poignant/frequency_distribution.rb +23 -0
- data/lib/poignant/ngrams.rb +19 -0
- data/lib/poignant/simple_tokenizer.rb +8 -0
- data/lib/poignant/token_operations.rb +17 -0
- data/lib/poignant/tokenizer_interface.rb +11 -0
- data/lib/poignant/version.rb +3 -0
- data/lib/poignant/word_operations.rb +20 -0
- data/lib/poignant.rb +8 -0
- data/poignant.gemspec +22 -0
- data/spec/corpus_spec.rb +60 -0
- data/spec/document_spec.rb +55 -0
- data/spec/tokenizers_spec.rb +15 -0
- metadata +94 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0f005ec458887532e94fc32a9f3c130039567cb5
|
4
|
+
data.tar.gz: 890379dc4360900fcdee8bec8d9dec2c1549fe0d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2df2eff33931f6fefcbc18e183ddf346b2ae7bf9c7e0c5f5151023cd9e7cdda334427c033cb6fd34017fc4e3905683975dbf1d51b4c07006adf1a775ea7893e4
|
7
|
+
data.tar.gz: b11c5dcd3986163302bc51fc7c2989c82d9e6adff3e64ffc8945b9044be41755fff54a2a502d91c6eef066e68b342a16aa4dc4d3e1cf81f62be8438ea901d85e
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Copyright (c) 2013 Aaron Massey
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the 'License');
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an 'AS IS' BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
data/README.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Poignant
|
2
|
+
|
3
|
+
Poignant is a toolkit for natural language processing in Ruby. It is named in
|
4
|
+
honor of [why the lucky stiff][1].
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'poignant'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install poignant
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
|
22
|
+
TODO: Write usage instructions here
|
23
|
+
|
24
|
+
## Contributing
|
25
|
+
|
26
|
+
1. Fork it
|
27
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
28
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
29
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
30
|
+
5. Create new Pull Request
|
31
|
+
|
32
|
+
|
33
|
+
[1]: http://en.wikipedia.org/wiki/Why_the_lucky_stiff
|
data/Rakefile
ADDED
data/lib/poignant/corpus.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require "poignant/frequency_distribution"
|
2
|
+
require "poignant/ngrams"
|
3
|
+
require "poignant/word_operations"
|
4
|
+
require "poignant/token_operations"
|
5
|
+
|
6
|
+
module Poignant
  # A group of documents analysed as one body of text.
  #
  # The word and frequency helpers mixed in below read +raw+, the combined
  # text of every document, which is rebuilt whenever the collection changes.
  class Corpus
    include Poignant::FrequencyDistribution
    include Poignant::NGrams
    include Poignant::WordOperations
    include Poignant::TokenOperations

    # The documents held by this corpus. This is the very array handed to
    # +new+ (no copy is taken), so +add+ also grows the caller's array.
    attr_reader :collection
    # this is kind of a hack; we want to remove this eventually
    attr_reader :raw

    # document_array - an Array of objects that respond to +raw+.
    def initialize(document_array)
      @collection = document_array
      prepare_raw
    end

    # Appends +document+ to the collection and refreshes +raw+.
    #
    # Returns the collection.
    def add(document)
      @collection << document
      prepare_raw
    end

    # Rebuilds +raw+ from the current collection.
    #
    # The documents' text is joined with a single space so the last word of
    # one document cannot fuse with the first word of the next (previously
    # "...sentence." followed by "This..." became "sentence.This"). A corpus
    # holding zero or one document produces exactly the same +raw+ as before.
    #
    # Returns the collection, as the earlier each-based version did.
    def prepare_raw
      @raw = @collection.map(&:raw).join(" ")
      @collection
    end
  end
end
|
data/lib/poignant/document.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require "poignant/word_operations"
|
2
|
+
require "poignant/frequency_distribution"
|
3
|
+
require "poignant/token_operations"
|
4
|
+
|
5
|
+
module Poignant
  # Wraps a single string of text so the word, token and frequency helpers
  # mixed in below can be applied to it.
  class Document
    # A multi-module include searches the first module named first, so this
    # yields the same lookup order as including WordOperations,
    # FrequencyDistribution and TokenOperations one after another.
    include TokenOperations, FrequencyDistribution, WordOperations

    # The text this document was created with, unmodified.
    attr_reader :raw

    # string - the text to wrap; it is stored as given.
    def initialize(string)
      @raw = string
    end
  end
end
|
18
|
+
|
data/lib/poignant/frequency_distribution.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
module Poignant
  # Counting helpers for objects that carry text.
  #
  # The including class must provide +words+ for the word-based methods and
  # +raw+ plus +unique_tokens+ for token counts.
  module FrequencyDistribution

    # Builds a Hash mapping each distinct word (or token) to the number of
    # times it occurs.
    #
    # tokens - when truthy, count each entry of +unique_tokens+ inside +raw+;
    #          otherwise count the normalized words (default: false).
    #
    # Returns a plain Hash of String => Integer, keyed in order of first
    # appearance. Looking up a word that never occurs returns nil.
    def frequency_distribution(tokens=false)
      fd = {}
      if tokens
        # NOTE(review): this is still a substring count, so a token that also
        # appears inside a longer token is over-counted. Counting whole
        # tokens needs the full token list, which is not available here.
        unique_tokens.each do |token|
          fd[token] = raw.scan(token).count
        end
      else
        # Tally whole words. Scanning the normalized text for each word
        # matched substrings as well, so "is" was also counted inside "this".
        words.each do |word|
          fd[word] = fd.fetch(word, 0) + 1
        end
      end
      fd
    end

    # Ratio of total words to distinct words; 1.0 means no word repeats.
    #
    # Returns a Float, or 0.0 when there are no words at all (the bare
    # division would otherwise yield NaN).
    def lexical_diversity
      all_words = words
      return 0.0 if all_words.empty?

      all_words.count * 1.0 / all_words.uniq.count
    end

  end
end
|
data/lib/poignant/tokenizer_interface.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Poignant
  # Contract for tokenizers: both methods here only raise, so a tokenizer
  # has to supply its own versions.
  module TokenizerInterface
    # Placeholder; always raises NotImplementedError naming this method.
    def tokenize(string)
      raise NotImplementedError, "Implementing '#{__method__}' is required for this class."
    end

    # Placeholder; always raises NotImplementedError naming this method.
    def span_tokenize(string)
      raise NotImplementedError, "Implementing '#{__method__}' is required for this class."
    end
  end
end
|
data/lib/poignant/word_operations.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module Poignant
  # Word-level helpers for any object that keeps its text in @raw.
  module WordOperations
    # Returns an Array of the normalized words, in the order they appear.
    def words
      normalized.split
    end

    # Returns the distinct words, in order of first appearance.
    def unique_words
      words.uniq
    end

    # Returns the total number of words, repeats included.
    def word_count
      words.count
    end

    # Returns the text lowercased and reduced to the letters a-z and spaces.
    #
    # Every whitespace character (newline, tab, ...) is first turned into a
    # space; deleting them together with punctuation glued the words on
    # either side of a line break together. Single-line text normalizes
    # exactly as before. Characters outside a-z, including accented letters
    # and digits, are dropped.
    def normalized
      @raw.downcase.gsub(/\s/, ' ').gsub(/[^a-z ]/, '')
    end
  end
end
|
20
|
+
|
data/lib/poignant.rb
ADDED
data/poignant.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'poignant/version'
|
5
|
+
|
6
|
+
# Packaging metadata for the poignant gem.
Gem::Specification.new do |gem|
  gem.name          = "poignant"
  gem.version       = Poignant::VERSION
  gem.authors       = ["Aaron Massey"]
  gem.email         = ["akmassey@sixlines.org"]
  # RubyGems treats the summary as the short form and the description as the
  # longer one, so the one-sentence text is the summary.
  gem.summary       = %q{Poignant is a toolkit for natural language processing in Ruby.}
  gem.description   = %q{Poignant is a toolkit for natural language processing in Ruby. It is named in honor of why the lucky stiff.}
  gem.homepage      = ""
  # Matches the Apache License, Version 2.0 text shipped in LICENSE.txt.
  gem.license       = "Apache-2.0"

  # Package every file tracked by git; the gem must be built from a checkout.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'rake'
end
|
data/spec/corpus_spec.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'poignant'
|
2
|
+
|
3
|
+
module Poignant

  # Examples for Corpus, built from the two documents prepared before each
  # example.
  describe Corpus do
    before(:each) do
      @small = "This is a small sentence."
      @repeating = "This is a small sentence with repeated words. Words are repeated in this small sentence."
      @punctuation = "This is a small sentence. It includes? Punctuation!"

      @document_small = Document.new(@small)
      @document_repeating = Document.new(@repeating)
    end

    it "should be initialized with an array of documents" do
      lambda { Corpus.new([@document_small, @document_repeating]) }.should_not raise_error
    end

    it "should be able to add a document to the collection" do
      corpus = Corpus.new([@document_small])
      lambda { corpus.add(@document_repeating) }.should_not raise_error
    end

    it "should be able to calculate a lexical diversity" do
      Corpus.new([@document_small]).lexical_diversity.should eq(1.0)
      # 15 words, 10 distinct: this, words, repeated, small and sentence
      # each appear twice.
      Corpus.new([@document_repeating]).lexical_diversity.should eq(15.0 / 10)
    end

    # NOTE(review): the disabled examples below hand a String to Corpus.new,
    # while Corpus#initialize iterates its argument and reads +raw+ from each
    # element. Presumably they need [Document.new(...)] before they can be
    # re-enabled -- confirm against the intended API.

    # it "should be able to calculate a frequency distribution" do
    #   corpus = Corpus.new(@repeating)
    #   corpus.frequency_distribution["repeated"].should eq 2
    #   corpus.frequency_distribution["small"].should eq 2
    #   corpus.frequency_distribution["this"].should eq 2
    #   corpus.frequency_distribution["are"].should eq 1
    # end

    # it "should be able to calculate a frequency distribution for tokens" do
    #   corpus = Corpus.new(@repeating)
    #   corpus.frequency_distribution(tokens=true)["repeated"].should eq 2
    #   corpus.frequency_distribution(tokens=true)["words."].should eq 1
    # end

    # it "should be able to remove punctuation" do
    #   Corpus.new(@punctuation).normalized.should eq "this is a small sentence it includes punctuation"
    # end

    # it "should be able to calculate ngrams" do
    #   bigrams = Corpus.new(@small).ngrams(2)
    #   bigrams.should_not be_nil
    #   bigrams[0].should eq ["this", "is"]
    #   bigrams[1].should eq ["is", "a"]
    #   bigrams[2].should eq ["a", "small"]
    # end
  end
end
|
data/spec/document_spec.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'poignant'
|
2
|
+
|
3
|
+
module Poignant

  # Examples for Document and the word/token helpers it mixes in.
  describe Document do
    before(:each) do
      @small = "This is a small sentence."
      @repeating = "This is a small sentence with repeated words. Words are repeated in this small sentence."
      @punctuation = "This is a small sentence. It includes? Punctuation!"
    end

    it "should be initialized with a string" do
      lambda { Document.new(@small) }.should_not raise_error
    end

    it "should be able to calculate a lexical diversity" do
      Document.new(@small).lexical_diversity.should eq(1.0)
      # 15 words, 10 distinct: this, words, repeated, small and sentence
      # each appear twice.
      Document.new(@repeating).lexical_diversity.should eq(15.0 / 10)
    end

    it "should be able to calculate unique tokens" do
      document = Document.new(@repeating)
      document.unique_tokens.count.should eq(13)
    end

    it "should be able to calculate a frequency distribution" do
      distribution = Document.new(@repeating).frequency_distribution
      distribution["repeated"].should eq(2)
      distribution["small"].should eq(2)
      distribution["this"].should eq(2)
      distribution["are"].should eq(1)
    end

    it "should be able to calculate a frequency distribution for tokens" do
      # The argument is positional; writing tokens=true only assigned a
      # throwaway local before passing true.
      distribution = Document.new(@repeating).frequency_distribution(true)
      distribution["repeated"].should eq(2)
      distribution["words."].should eq(1)
    end

    it "should be able to remove punctuation" do
      Document.new(@punctuation).normalized.should eq("this is a small sentence it includes punctuation")
    end

    # it "should be able to calculate ngrams" do
    #   bigrams = Document.new(@small).ngrams(2)
    #   bigrams.should_not be_nil
    #   bigrams[0].should eq ["this", "is"]
    #   bigrams[1].should eq ["is", "a"]
    #   bigrams[2].should eq ["a", "small"]
    # end
  end
end
|
55
|
+
|
data/spec/tokenizers_spec.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'poignant'
|
2
|
+
|
3
|
+
module Poignant

  # Example for SimpleTokenizer: a sentence is cut at its spaces and the
  # punctuation stays attached to the neighbouring word.
  describe SimpleTokenizer do

    it "should be able to tokenize based on spaces" do
      expected = %w[This is a sentence.]
      SimpleTokenizer.tokenize("This is a sentence.").should eq(expected)
    end

  end

end
|
metadata
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: poignant
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aaron Massey
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-04-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: Poignant is a toolkit for natural language processing in Ruby.
|
42
|
+
email:
|
43
|
+
- akmassey@sixlines.org
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- .gitignore
|
49
|
+
- Gemfile
|
50
|
+
- LICENSE.txt
|
51
|
+
- README.md
|
52
|
+
- Rakefile
|
53
|
+
- lib/poignant.rb
|
54
|
+
- lib/poignant/corpus.rb
|
55
|
+
- lib/poignant/document.rb
|
56
|
+
- lib/poignant/frequency_distribution.rb
|
57
|
+
- lib/poignant/ngrams.rb
|
58
|
+
- lib/poignant/simple_tokenizer.rb
|
59
|
+
- lib/poignant/token_operations.rb
|
60
|
+
- lib/poignant/tokenizer_interface.rb
|
61
|
+
- lib/poignant/version.rb
|
62
|
+
- lib/poignant/word_operations.rb
|
63
|
+
- poignant.gemspec
|
64
|
+
- spec/corpus_spec.rb
|
65
|
+
- spec/document_spec.rb
|
66
|
+
- spec/tokenizers_spec.rb
|
67
|
+
homepage: ''
|
68
|
+
licenses: []
|
69
|
+
metadata: {}
|
70
|
+
post_install_message:
|
71
|
+
rdoc_options: []
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
requirements:
|
76
|
+
- - '>='
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: '0'
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
requirements: []
|
85
|
+
rubyforge_project:
|
86
|
+
rubygems_version: 2.0.3
|
87
|
+
signing_key:
|
88
|
+
specification_version: 4
|
89
|
+
summary: Poignant is a toolkit for natural language processing in Ruby. It is named
|
90
|
+
in honor of why the lucky stiff.
|
91
|
+
test_files:
|
92
|
+
- spec/corpus_spec.rb
|
93
|
+
- spec/document_spec.rb
|
94
|
+
- spec/tokenizers_spec.rb
|