poignant 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0f005ec458887532e94fc32a9f3c130039567cb5
4
+ data.tar.gz: 890379dc4360900fcdee8bec8d9dec2c1549fe0d
5
+ SHA512:
6
+ metadata.gz: 2df2eff33931f6fefcbc18e183ddf346b2ae7bf9c7e0c5f5151023cd9e7cdda334427c033cb6fd34017fc4e3905683975dbf1d51b4c07006adf1a775ea7893e4
7
+ data.tar.gz: b11c5dcd3986163302bc51fc7c2989c82d9e6adff3e64ffc8945b9044be41755fff54a2a502d91c6eef066e68b342a16aa4dc4d3e1cf81f62be8438ea901d85e
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in poignant.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,13 @@
1
+ Copyright (c) 2013 Aaron Massey
2
+
3
+ Licensed under the Apache License, Version 2.0 (the 'License');
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an 'AS IS' BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # Poignant
2
+
3
+ Poignant is a toolkit for natural language processing in Ruby. It is named in
4
+ honor of [why the lucky stiff][1].
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'poignant'
11
+
12
+ And then execute:
13
+
14
+ $ bundle
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install poignant
19
+
20
+ ## Usage
21
+
22
+ TODO: Write usage instructions here
23
+
24
+ ## Contributing
25
+
26
+ 1. Fork it
27
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
28
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
29
+ 4. Push to the branch (`git push origin my-new-feature`)
30
+ 5. Create a new Pull Request
31
+
32
+
33
+ [1]: http://en.wikipedia.org/wiki/Why_the_lucky_stiff
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# Define the `spec` Rake task through RSpec's Rake integration.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the `spec` task.
task default: :spec
@@ -0,0 +1,35 @@
1
+ require "poignant/frequency_distribution"
2
+ require "poignant/ngrams"
3
+ require "poignant/word_operations"
4
+ require "poignant/token_operations"
5
+
6
module Poignant
  # A collection of documents analysed as one body of text.
  #
  # The word, token, n-gram and frequency mixins all operate on {#raw}, the
  # combined text of every document currently in {#collection}.
  class Corpus
    include Poignant::FrequencyDistribution
    include Poignant::NGrams
    include Poignant::WordOperations
    include Poignant::TokenOperations

    # Text placed between documents when their raw text is combined. Without
    # it the last word of one document fuses with the first word of the next
    # (e.g. "sentence.This"), corrupting every word and token statistic.
    DOCUMENT_SEPARATOR = " ".freeze

    # @return [Array] the documents that make up this corpus
    attr_reader :collection
    # this is kind of a hack; we want to remove this eventually
    # @return [String] the raw text of every document, joined by
    #   DOCUMENT_SEPARATOR
    attr_reader :raw

    # @param document_array [Array] objects responding to `raw`. The array is
    #   stored as given (not copied), so {#add} appends to the caller's array.
    def initialize(document_array)
      @collection = document_array
      prepare_raw
    end

    # Appends a document to the corpus and rebuilds the combined text.
    #
    # @param document [#raw] the document to append
    # @return [Array] the updated collection
    def add(document)
      @collection << document
      prepare_raw
    end

    # Rebuilds {#raw} from the current collection.
    #
    # @return [Array] the collection (the return value callers have always
    #   received from this method)
    def prepare_raw
      @raw = @collection.map(&:raw).join(DOCUMENT_SEPARATOR)
      @collection
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require "poignant/word_operations"
2
+ require "poignant/frequency_distribution"
3
+ require "poignant/token_operations"
4
+
5
module Poignant
  # A single piece of text. Word, token and frequency statistics come from
  # the included mixins, all of which work from the raw text stored here.
  class Document
    include Poignant::WordOperations
    include Poignant::FrequencyDistribution
    include Poignant::TokenOperations

    # @return [String] the text exactly as it was given to the constructor
    attr_reader :raw

    # @param string [String] the text of the document
    def initialize(string)
      @raw = string
    end
  end
end
18
+
@@ -0,0 +1,23 @@
1
module Poignant
  # Frequency statistics for text-like objects.
  #
  # The including class must provide `words`, `unique_words`, `word_count`
  # and `tokens` (see WordOperations and TokenOperations).
  module FrequencyDistribution

    # Counts how often each word (or token) occurs.
    #
    # Occurrences are counted as whole items. Counting substring matches
    # against the full text would inflate short entries: "is" would also be
    # counted inside "this", and "a" inside every word containing an "a".
    #
    # @param tokens [Boolean] when true, count raw whitespace-separated
    #   tokens; otherwise count normalized words
    # @return [Hash{String => Integer}] counts keyed by item, in order of
    #   first appearance; items that never occur are absent (lookup gives nil)
    def frequency_distribution(tokens=false)
      # The `tokens` parameter shadows the host's `tokens` method, so the
      # method has to be called with an explicit receiver.
      items = tokens ? self.tokens : words
      items.each_with_object({}) do |item, fd|
        fd[item] = fd.fetch(item, 0) + 1
      end
    end

    # Ratio of total words to distinct words (1.0 means no word repeats).
    #
    # @return [Float] the ratio, or 0.0 when there are no words at all
    #   (the plain division would yield NaN)
    def lexical_diversity
      total = word_count
      return 0.0 if total.zero?

      total * 1.0 / unique_words.count
    end

  end
end
@@ -0,0 +1,19 @@
1
module Poignant
  # N-gram extraction for any object that provides `words`, an ordered
  # array of words.
  module NGrams
    # @return [Array<Array<String>>] every run of two consecutive words
    def bigrams
      ngrams(2)
    end

    # @return [Array<Array<String>>] every run of three consecutive words
    def trigrams
      ngrams(3)
    end

    # Builds every run of `num` consecutive words, in order.
    #
    # `words` is evaluated once per call rather than once per n-gram.
    #
    # @param num [Integer] the size of each n-gram; must be at least 1
    # @return [Array<Array<String>>] the n-grams; empty when there are fewer
    #   than `num` words
    # @raise [ArgumentError] if `num` is less than 1 (there is no meaningful
    #   n-gram of that size)
    def ngrams(num)
      raise ArgumentError, "n-gram size must be at least 1 (got #{num})" if num < 1

      words.each_cons(num).to_a
    end
  end
end
@@ -0,0 +1,8 @@
1
module Poignant
  # The most basic tokenizer: tokens are the whitespace-separated pieces of
  # the input, with punctuation left attached.
  module SimpleTokenizer
    class << self
      # @param string [String] the text to tokenize
      # @return [Array<String>] the whitespace-separated tokens
      def tokenize(string)
        string.split
      end
    end
  end
end
8
+
@@ -0,0 +1,17 @@
1
module Poignant
  # Token-level helpers for objects that keep their text in `@raw`.
  # A token is a whitespace-separated piece of the raw text; case and
  # punctuation are left untouched.
  module TokenOperations
    # @return [Array<String>] every token, in order of appearance
    def tokens
      @raw.split
    end

    # @return [Array<String>] the distinct tokens, in order of first appearance
    def unique_tokens
      tokens.uniq
    end

    # @return [Integer] the total number of tokens, repeats included
    def token_count
      tokens.size
    end
  end
end
16
+
17
+
@@ -0,0 +1,11 @@
1
module Poignant
  # Contract for tokenizers. Both methods raise until the including class
  # supplies its own implementation.
  module TokenizerInterface
    # @param string [String] the text to tokenize
    # @raise [NotImplementedError] always, unless overridden
    def tokenize(string)
      fail NotImplementedError, "Implementing 'tokenize' is required for this class."
    end

    # @param string [String] the text to tokenize
    # @raise [NotImplementedError] always, unless overridden
    def span_tokenize(string)
      fail NotImplementedError, "Implementing 'span_tokenize' is required for this class."
    end
  end
end
@@ -0,0 +1,3 @@
1
module Poignant
  # Version of the gem; read by the gemspec. Frozen so the shared constant
  # cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,20 @@
1
module Poignant
  # Word-level helpers for objects that keep their text in `@raw`.
  # A word is a whitespace-separated piece of the normalized text.
  module WordOperations
    # @return [Array<String>] every normalized word, in order of appearance
    def words
      normalized.split
    end

    # @return [Array<String>] the distinct words, in order of first appearance
    def unique_words
      words.uniq
    end

    # @return [Integer] the total number of words, repeats included
    def word_count
      words.count
    end

    # Lower-cases the raw text and strips everything except the letters
    # a-z and spaces.
    #
    # Tabs, newlines and other whitespace are turned into spaces first.
    # Deleting them along with the punctuation would fuse the words on
    # either side ("one\ntwo" would become "onetwo").
    #
    # @return [String] text made up only of a-z and space characters
    def normalized
      @raw.downcase.gsub(/\s/, ' ').gsub(/[^a-z ]/, '')
    end
  end
end
20
+
data/lib/poignant.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "poignant/version"
2
+ require "poignant/corpus"
3
+ require "poignant/document"
4
+ require "poignant/simple_tokenizer"
5
+
6
# Top-level namespace of the gem. The classes and modules that make up the
# toolkit are defined in the files required above.
module Poignant
end
data/poignant.gemspec ADDED
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'poignant/version'
5
+
6
Gem::Specification.new do |gem|
  gem.name          = "poignant"
  gem.version       = Poignant::VERSION
  gem.authors       = ["Aaron Massey"]
  gem.email         = ["akmassey@sixlines.org"]
  # RubyGems treats the summary as the short one-liner and the description
  # as the longer text, so the shorter sentence belongs in the summary.
  gem.summary       = %q{Poignant is a toolkit for natural language processing in Ruby.}
  gem.description   = %q{Poignant is a toolkit for natural language processing in Ruby. It is named in honor of why the lucky stiff.}
  # TODO: fill in once the project has a public home page.
  gem.homepage      = ""
  # Matches the Apache License, Version 2.0 notice shipped in LICENSE.txt.
  gem.license       = "Apache-2.0"

  # Package exactly the files tracked by git; must be built from a checkout.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'rake'
end
@@ -0,0 +1,60 @@
1
+ require 'poignant'
2
+
3
module Poignant

  # Specs for Corpus. A corpus is always built from an array of Document
  # objects; the previously disabled examples passed a bare String instead,
  # which is why they could not run.
  describe Corpus do
    before(:each) do
      @small = "This is a small sentence."
      @repeating = "This is a small sentence with repeated words. Words are repeated in this small sentence."
      @punctuation = "This is a small sentence. It includes? Punctuation!"

      @document_small = Document.new(@small)
      @document_repeating = Document.new(@repeating)
      @document_punctuation = Document.new(@punctuation)
    end

    it "should be initialized with an array of documents" do
      document = Document.new(@small)
      another = Document.new(@repeating)
      lambda { Corpus.new([document, another]) }.should_not raise_error
    end

    it "should be able to add a document to the collection" do
      first = Document.new(@small)
      corpus = Corpus.new([first])
      lambda { corpus.add(Document.new(@repeating)) }.should_not raise_error
    end

    it "should be able to calculate a lexical diversity" do
      corpus = Corpus.new([@document_small])
      corpus.lexical_diversity.should eq 1.0
      another = Corpus.new([@document_repeating])
      another.lexical_diversity.should eq(15.0 / 10) # this words repeated small sentence
    end

    it "should be able to calculate a frequency distribution" do
      corpus = Corpus.new([@document_repeating])
      corpus.frequency_distribution["repeated"].should eq 2
      corpus.frequency_distribution["small"].should eq 2
      corpus.frequency_distribution["this"].should eq 2
      corpus.frequency_distribution["are"].should eq 1
    end

    it "should be able to calculate a frequency distribution for tokens" do
      corpus = Corpus.new([@document_repeating])
      corpus.frequency_distribution(true)["repeated"].should eq 2
      corpus.frequency_distribution(true)["words."].should eq 1
    end

    it "should be able to remove punctuation" do
      Corpus.new([@document_punctuation]).normalized.should eq "this is a small sentence it includes punctuation"
    end

    it "should be able to calculate ngrams" do
      bigrams = Corpus.new([@document_small]).ngrams(2)
      bigrams.should_not be_nil
      bigrams[0].should eq ["this", "is"]
      bigrams[1].should eq ["is", "a"]
      bigrams[2].should eq ["a", "small"]
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'poignant'
2
+
3
module Poignant

  # Specs for Document: construction plus the word, token and frequency
  # statistics it gets from its mixins.
  describe Document do
    before(:each) do
      @small = "This is a small sentence."
      @repeating = "This is a small sentence with repeated words. Words are repeated in this small sentence."
      @punctuation = "This is a small sentence. It includes? Punctuation!"
    end

    it "should be initialized with a string" do
      lambda { Document.new(@small) }.should_not raise_error
    end

    it "should be able to calculate a lexical diversity" do
      Document.new(@small).lexical_diversity.should eq 1.0
      # 15 words, 10 of them distinct: this words repeated small sentence
      Document.new(@repeating).lexical_diversity.should eq(15.0 / 10)
    end

    it "should be able to calculate unique tokens" do
      Document.new(@repeating).unique_tokens.count.should eq 13
    end

    it "should be able to calculate a frequency distribution" do
      document = Document.new(@repeating)
      document.frequency_distribution["repeated"].should eq 2
      document.frequency_distribution["small"].should eq 2
      document.frequency_distribution["this"].should eq 2
      document.frequency_distribution["are"].should eq 1
    end

    it "should be able to calculate a frequency distribution for tokens" do
      document = Document.new(@repeating)
      document.frequency_distribution(true)["repeated"].should eq 2
      document.frequency_distribution(true)["words."].should eq 1
    end

    it "should be able to remove punctuation" do
      document = Document.new(@punctuation)
      document.normalized.should eq "this is a small sentence it includes punctuation"
    end

    # Disabled example, kept for reference.
    # it "should be able to calculate ngrams" do
    #   bigrams = Document.new(@small).ngrams(2)
    #   bigrams.should_not be_nil
    #   bigrams[0].should eq ["this", "is"]
    #   bigrams[1].should eq ["is", "a"]
    #   bigrams[2].should eq ["a", "small"]
    # end
  end
end
55
+
@@ -0,0 +1,15 @@
1
+ require 'poignant'
2
+
3
module Poignant

  # Specs for SimpleTokenizer, which splits text on whitespace only.
  describe SimpleTokenizer do

    it "should be able to tokenize based on spaces" do
      expected = ["This", "is", "a", "sentence."]
      SimpleTokenizer.tokenize("This is a sentence.").should eq expected
    end

  end

end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: poignant
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Aaron Massey
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Poignant is a toolkit for natural language processing in Ruby.
42
+ email:
43
+ - akmassey@sixlines.org
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - lib/poignant.rb
54
+ - lib/poignant/corpus.rb
55
+ - lib/poignant/document.rb
56
+ - lib/poignant/frequency_distribution.rb
57
+ - lib/poignant/ngrams.rb
58
+ - lib/poignant/simple_tokenizer.rb
59
+ - lib/poignant/token_operations.rb
60
+ - lib/poignant/tokenizer_interface.rb
61
+ - lib/poignant/version.rb
62
+ - lib/poignant/word_operations.rb
63
+ - poignant.gemspec
64
+ - spec/corpus_spec.rb
65
+ - spec/document_spec.rb
66
+ - spec/tokenizers_spec.rb
67
+ homepage: ''
68
+ licenses: []
69
+ metadata: {}
70
+ post_install_message:
71
+ rdoc_options: []
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - '>='
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ requirements: []
85
+ rubyforge_project:
86
+ rubygems_version: 2.0.3
87
+ signing_key:
88
+ specification_version: 4
89
+ summary: Poignant is a toolkit for natural language processing in Ruby. It is named
90
+ in honor of why the lucky stiff.
91
+ test_files:
92
+ - spec/corpus_spec.rb
93
+ - spec/document_spec.rb
94
+ - spec/tokenizers_spec.rb