poignant 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0f005ec458887532e94fc32a9f3c130039567cb5
4
+ data.tar.gz: 890379dc4360900fcdee8bec8d9dec2c1549fe0d
5
+ SHA512:
6
+ metadata.gz: 2df2eff33931f6fefcbc18e183ddf346b2ae7bf9c7e0c5f5151023cd9e7cdda334427c033cb6fd34017fc4e3905683975dbf1d51b4c07006adf1a775ea7893e4
7
+ data.tar.gz: b11c5dcd3986163302bc51fc7c2989c82d9e6adff3e64ffc8945b9044be41755fff54a2a502d91c6eef066e68b342a16aa4dc4d3e1cf81f62be8438ea901d85e
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gems are fetched from the public RubyGems index.
source "https://rubygems.org"

# The gem's dependencies are declared in poignant.gemspec.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,13 @@
1
+ Copyright (c) 2013 Aaron Massey
2
+
3
+ Licensed under the Apache License, Version 2.0 (the 'License');
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an 'AS IS' BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # Poignant
2
+
3
+ Poignant is a toolkit for natural language processing in Ruby. It is named in
4
+ honor of [why the lucky stiff][1].
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'poignant'
11
+
12
+ And then execute:
13
+
14
+ $ bundle
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install poignant
19
+
20
+ ## Usage
21
+
22
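+ Wrap text in a `Poignant::Document`, or group several documents in a
+ `Poignant::Corpus`, and call the analysis methods on the result:
+
+     require 'poignant'
+
+     document = Poignant::Document.new("This is a small sentence.")
+     document.words              # => ["this", "is", "a", "small", "sentence"]
+     document.tokens             # => ["This", "is", "a", "small", "sentence."]
+     document.lexical_diversity  # => 1.0
+
+     corpus = Poignant::Corpus.new([document])
+     corpus.bigrams.first        # => ["this", "is"]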
23
+
24
+ ## Contributing
25
+
26
+ 1. Fork it
27
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
28
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
29
+ 4. Push to the branch (`git push origin my-new-feature`)
30
+ 5. Create new Pull Request
31
+
32
+
33
+ [1]: http://en.wikipedia.org/wiki/Why_the_lucky_stiff
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Define the :spec task.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the :spec task.
task default: [:spec]
@@ -0,0 +1,35 @@
1
+ require "poignant/frequency_distribution"
2
+ require "poignant/ngrams"
3
+ require "poignant/word_operations"
4
+ require "poignant/token_operations"
5
+
6
module Poignant
  # A collection of documents analysed as one body of text.
  #
  # The frequency, n-gram, word and token helpers mixed in below operate on
  # the combined text of every document in the collection.
  class Corpus
    include Poignant::FrequencyDistribution
    include Poignant::NGrams
    include Poignant::WordOperations
    include Poignant::TokenOperations

    attr_reader :collection
    # this is kind of a hack; we want to remove this eventually
    attr_reader :raw

    # document_array - a collection of objects that respond to #raw.
    def initialize(document_array)
      # Work on a copy so that #add never mutates the caller's array.
      @collection = document_array.dup
      prepare_raw
    end

    # Appends a document to the collection and rebuilds the combined text.
    #
    # Returns the collection.
    def add(document)
      @collection << document
      prepare_raw
    end

    # Rebuilds @raw from the raw text of every document in the collection.
    #
    # Returns the collection.
    def prepare_raw
      # Join with a space so the last word of one document does not fuse with
      # the first word of the next.
      @raw = @collection.map(&:raw).join(" ")
      @collection
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require "poignant/word_operations"
2
+ require "poignant/frequency_distribution"
3
+ require "poignant/token_operations"
4
+
5
module Poignant
  # A single piece of text. The word, frequency and token helpers mixed in
  # below read the text stored in @raw.
  class Document
    include Poignant::WordOperations
    include Poignant::FrequencyDistribution
    include Poignant::TokenOperations

    # string - the text this document wraps; stored exactly as given.
    def initialize(string)
      @raw = string
    end

    # Returns the text handed to the constructor.
    def raw
      @raw
    end
  end
end
18
+
@@ -0,0 +1,23 @@
1
module Poignant
  # Frequency statistics for any object that provides #words, #tokens,
  # #word_count and #unique_words.
  module FrequencyDistribution

    # Counts how often each word (or token) occurs.
    #
    # tokens - when truthy, count the tokens from #tokens;
    #          otherwise count the words from #words.
    #
    # Returns a Hash mapping each distinct item to its number of occurrences,
    # in order of first appearance. Items that never occur are absent.
    def frequency_distribution(tokens=false)
      # The parameter shadows the #tokens method, hence the explicit receiver.
      items = tokens ? self.tokens : words

      # Count whole items. Scanning the text for each item would also count
      # substring hits (e.g. "is" inside "this").
      items.each_with_object({}) do |item, fd|
        fd[item] = fd.fetch(item, 0) + 1
      end
    end

    # Ratio of total words to distinct words.
    #
    # Returns a Float; 0.0 when there are no words (instead of NaN).
    def lexical_diversity
      distinct = unique_words.count
      return 0.0 if distinct.zero?

      word_count * 1.0 / distinct
    end

  end
end
@@ -0,0 +1,19 @@
1
module Poignant
  # N-gram helpers for any object that provides #words.
  module NGrams
    # Returns every run of two consecutive words.
    def bigrams
      ngrams(2)
    end

    # Returns every run of three consecutive words.
    def trigrams
      ngrams(3)
    end

    # Builds every run of `num` consecutive words, in order of appearance.
    #
    # num - the n-gram size; must be a positive Integer.
    #
    # Returns an Array of Arrays of words; empty when there are fewer than
    # `num` words.
    # Raises ArgumentError when num is not a positive Integer.
    def ngrams(num)
      unless num.is_a?(Integer) && num > 0
        raise ArgumentError, "n-gram size must be a positive integer"
      end

      # #words is called once; each_cons yields the sliding windows.
      words.each_cons(num).to_a
    end
  end
end
@@ -0,0 +1,8 @@
1
module Poignant
  # Tokenizer that breaks a string into tokens with String#split.
  module SimpleTokenizer
    class << self
      # string - the text to tokenize.
      #
      # Returns the Array of tokens produced by String#split.
      def tokenize(string)
        pieces = string.split
        pieces
      end
    end
  end
end
8
+
@@ -0,0 +1,17 @@
1
module Poignant
  # Token helpers for any object that keeps its text in @raw.
  module TokenOperations
    # Returns the pieces of @raw produced by String#split.
    def tokens
      text = @raw
      text.split
    end

    # Returns the tokens with duplicates removed, in order of first appearance.
    def unique_tokens
      all_tokens = tokens
      all_tokens.uniq
    end

    # Returns the number of tokens, duplicates included.
    def token_count
      tokens.length
    end
  end
end
16
+
17
+
@@ -0,0 +1,11 @@
1
module Poignant
  # Methods a tokenizer is expected to implement. Each default below raises
  # NotImplementedError until the including class overrides it.
  #
  # Note: NotImplementedError descends from ScriptError, so a bare `rescue`
  # does not catch it.
  module TokenizerInterface
    # string - the text to tokenize (unused by this default).
    #
    # Raises NotImplementedError always.
    def tokenize(string)
      message = "Implementing 'tokenize' is required for this class."
      raise NotImplementedError.new(message)
    end

    # string - the text to tokenize (unused by this default).
    #
    # Raises NotImplementedError always.
    def span_tokenize(string)
      message = "Implementing 'span_tokenize' is required for this class."
      raise NotImplementedError.new(message)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Poignant
  # Version of the gem; read by poignant.gemspec.
  # Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,20 @@
1
module Poignant
  # Word helpers for any object that keeps its text in @raw.
  module WordOperations
    # Returns the normalized text split into an Array of words.
    def words
      normalized.split
    end

    # Returns the words with duplicates removed, in order of first appearance.
    def unique_words
      words.uniq
    end

    # Returns the number of words, duplicates included.
    def word_count
      words.count
    end

    # Returns @raw lower-cased and reduced to the letters a-z and spaces.
    #
    # Whitespace such as newlines and tabs is turned into a space first.
    # Deleting it would fuse the neighbouring words
    # ("hello\nworld" would become "helloworld").
    def normalized
      @raw.downcase.gsub(/\s/, ' ').gsub(/[^a-z ]/, '')
    end
  end
end
20
+
data/lib/poignant.rb ADDED
@@ -0,0 +1,8 @@
1
require "poignant/version"
require "poignant/corpus"
require "poignant/document"
require "poignant/simple_tokenizer"
# Shipped with the gem but previously never loaded, which left
# Poignant::TokenizerInterface undefined after `require "poignant"`.
require "poignant/tokenizer_interface"

# Top-level namespace for the gem; the files required above define its
# classes and modules.
module Poignant
end
data/poignant.gemspec ADDED
@@ -0,0 +1,22 @@
1
# -*- encoding: utf-8 -*-
# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'poignant/version'

Gem::Specification.new do |gem|
  gem.name          = "poignant"
  gem.version       = Poignant::VERSION
  gem.authors       = ["Aaron Massey"]
  gem.email         = ["akmassey@sixlines.org"]
  gem.description   = %q{Poignant is a toolkit for natural language processing in Ruby.}
  gem.summary       = %q{Poignant is a toolkit for natural language processing in Ruby. It is named in honor of why the lucky stiff.}
  gem.homepage      = ""
  # Matches LICENSE.txt (Apache License, Version 2.0); without this the built
  # gem declares no license at all.
  gem.license       = "Apache-2.0"

  # Package every file tracked by git.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'rake'
end
@@ -0,0 +1,60 @@
1
+ require 'poignant'
2
+
3
module Poignant

  describe Corpus do
    # Sample texts shared by the examples.
    let(:small)       { "This is a small sentence." }
    let(:repeating)   { "This is a small sentence with repeated words. Words are repeated in this small sentence." }
    # Only referenced by the disabled examples at the bottom.
    let(:punctuation) { "This is a small sentence. It includes? Punctuation!" }

    let(:document_small)     { Document.new(small) }
    let(:document_repeating) { Document.new(repeating) }

    it "should be initialized with an array of documents" do
      documents = [Document.new(small), Document.new(repeating)]
      lambda { Corpus.new(documents) }.should_not raise_error
    end

    it "should be able to add a document to the collection" do
      corpus = Corpus.new([Document.new(small)])
      extra  = lambda { corpus.add(Document.new(repeating)) }
      extra.should_not raise_error
    end

    it "should be able to calculate a lexical diversity" do
      Corpus.new([document_small]).lexical_diversity.should eq(1.0)

      # 15 words, 10 of them distinct: "this", "words", "repeated", "small"
      # and "sentence" each occur twice.
      Corpus.new([document_repeating]).lexical_diversity.should eq(15.0 / 10)
    end

    # Disabled examples, kept as written. They pass a String to Corpus.new,
    # whereas the examples above construct a Corpus from an array of documents.
    #
    # it "should be able to calculate a frequency distribution" do
    #   corpus = Corpus.new(@repeating)
    #   corpus.frequency_distribution["repeated"].should eq 2
    #   corpus.frequency_distribution["small"].should eq 2
    #   corpus.frequency_distribution["this"].should eq 2
    #   corpus.frequency_distribution["are"].should eq 1
    # end

    # it "should be able to calculate a frequency distribution for tokens" do
    #   corpus = Corpus.new(@repeating)
    #   corpus.frequency_distribution(tokens=true)["repeated"].should eq 2
    #   corpus.frequency_distribution(tokens=true)["words."].should eq 1
    # end

    # it "should be able to remove punctuation" do
    #   Corpus.new(@punctuation).normalized.should eq "this is a small sentence it includes punctuation"
    # end

    # it "should be able to calculate ngrams" do
    #   bigrams = Corpus.new(@small).ngrams(2)
    #   bigrams.should_not be_nil
    #   bigrams[0].should eq ["this", "is"]
    #   bigrams[1].should eq ["is", "a"]
    #   bigrams[2].should eq ["a", "small"]
    # end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'poignant'
2
+
3
module Poignant

  describe Document do
    # Sample texts shared by the examples.
    let(:small)       { "This is a small sentence." }
    let(:repeating)   { "This is a small sentence with repeated words. Words are repeated in this small sentence." }
    let(:punctuation) { "This is a small sentence. It includes? Punctuation!" }

    it "should be initialized with a string" do
      build = lambda { Document.new(small) }
      build.should_not raise_error
    end

    it "should be able to calculate a lexical diversity" do
      Document.new(small).lexical_diversity.should eq(1.0)

      # 15 words, 10 of them distinct: "this", "words", "repeated", "small"
      # and "sentence" each occur twice.
      Document.new(repeating).lexical_diversity.should eq(15.0 / 10)
    end

    it "should be able to calculate unique tokens" do
      Document.new(repeating).unique_tokens.count.should eq(13)
    end

    it "should be able to calculate a frequency distribution" do
      document = Document.new(repeating)
      document.frequency_distribution["repeated"].should eq(2)
      document.frequency_distribution["small"].should eq(2)
      document.frequency_distribution["this"].should eq(2)
      document.frequency_distribution["are"].should eq(1)
    end

    it "should be able to calculate a frequency distribution for tokens" do
      document = Document.new(repeating)
      document.frequency_distribution(true)["repeated"].should eq(2)
      document.frequency_distribution(true)["words."].should eq(1)
    end

    it "should be able to remove punctuation" do
      cleaned = Document.new(punctuation).normalized
      cleaned.should eq("this is a small sentence it includes punctuation")
    end

    # Disabled example, kept as written.
    #
    # it "should be able to calculate ngrams" do
    #   bigrams = Document.new(@small).ngrams(2)
    #   bigrams.should_not be_nil
    #   bigrams[0].should eq ["this", "is"]
    #   bigrams[1].should eq ["is", "a"]
    #   bigrams[2].should eq ["a", "small"]
    # end
  end
end
55
+
@@ -0,0 +1,15 @@
1
+ require 'poignant'
2
+
3
module Poignant

  describe SimpleTokenizer do

    it "should be able to tokenize based on spaces" do
      # Punctuation stays attached to the token it follows.
      expected = %w[This is a sentence.]
      SimpleTokenizer.tokenize("This is a sentence.").should eq(expected)
    end

  end

end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: poignant
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Aaron Massey
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Poignant is a toolkit for natural language processing in Ruby.
42
+ email:
43
+ - akmassey@sixlines.org
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - lib/poignant.rb
54
+ - lib/poignant/corpus.rb
55
+ - lib/poignant/document.rb
56
+ - lib/poignant/frequency_distribution.rb
57
+ - lib/poignant/ngrams.rb
58
+ - lib/poignant/simple_tokenizer.rb
59
+ - lib/poignant/token_operations.rb
60
+ - lib/poignant/tokenizer_interface.rb
61
+ - lib/poignant/version.rb
62
+ - lib/poignant/word_operations.rb
63
+ - poignant.gemspec
64
+ - spec/corpus_spec.rb
65
+ - spec/document_spec.rb
66
+ - spec/tokenizers_spec.rb
67
+ homepage: ''
68
+ licenses: []
69
+ metadata: {}
70
+ post_install_message:
71
+ rdoc_options: []
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - '>='
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ requirements: []
85
+ rubyforge_project:
86
+ rubygems_version: 2.0.3
87
+ signing_key:
88
+ specification_version: 4
89
+ summary: Poignant is a toolkit for natural language processing in Ruby. It is named
90
+ in honor of why the lucky stiff.
91
+ test_files:
92
+ - spec/corpus_spec.rb
93
+ - spec/document_spec.rb
94
+ - spec/tokenizers_spec.rb