RubyGems - poignant - Versions diffs - 0.0.1 - Mend

poignant 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/.gitignore +17 -0
data/Gemfile +4 -0
data/LICENSE.txt +13 -0
data/README.md +33 -0
data/Rakefile +6 -0
data/lib/poignant/corpus.rb +35 -0
data/lib/poignant/document.rb +18 -0
data/lib/poignant/frequency_distribution.rb +23 -0
data/lib/poignant/ngrams.rb +19 -0
data/lib/poignant/simple_tokenizer.rb +8 -0
data/lib/poignant/token_operations.rb +17 -0
data/lib/poignant/tokenizer_interface.rb +11 -0
data/lib/poignant/version.rb +3 -0
data/lib/poignant/word_operations.rb +20 -0
data/lib/poignant.rb +8 -0
data/poignant.gemspec +22 -0
data/spec/corpus_spec.rb +60 -0
data/spec/document_spec.rb +55 -0
data/spec/tokenizers_spec.rb +15 -0
metadata +94 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 0f005ec458887532e94fc32a9f3c130039567cb5
+  data.tar.gz: 890379dc4360900fcdee8bec8d9dec2c1549fe0d
+SHA512:
+  metadata.gz: 2df2eff33931f6fefcbc18e183ddf346b2ae7bf9c7e0c5f5151023cd9e7cdda334427c033cb6fd34017fc4e3905683975dbf1d51b4c07006adf1a775ea7893e4
+  data.tar.gz: b11c5dcd3986163302bc51fc7c2989c82d9e6adff3e64ffc8945b9044be41755fff54a2a502d91c6eef066e68b342a16aa4dc4d3e1cf81f62be8438ea901d85e

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in poignant.gemspec
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,13 @@
+Copyright (c) 2013 Aaron Massey
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

data/README.md ADDED Viewed

@@ -0,0 +1,33 @@
+# Poignant
+Poignant is a toolkit for natural language processing in Ruby.  It is named in
+honor of [why the lucky stiff][1].
+## Installation
+Add this line to your application's Gemfile:
+    gem 'poignant'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install poignant
+## Usage
+TODO: Write usage instructions here
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
+[1]: http://en.wikipedia.org/wiki/Why_the_lucky_stiff

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+# Define the `spec` task and make it the task a bare `rake` runs.
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/lib/poignant/corpus.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require "poignant/frequency_distribution"
+require "poignant/ngrams"
+require "poignant/word_operations"
+require "poignant/token_operations"
+module Poignant
+  class Corpus
+    include Poignant::FrequencyDistribution
+    include Poignant::NGrams
+    include Poignant::WordOperations
+    include Poignant::TokenOperations
+    attr_reader :collection
+    # this is kind of a hack; we want to remove this eventually
+    attr_reader :raw
+    def initialize(document_array)
+      @collection = document_array
+      prepare_raw
+    end
+    def add(document)
+      @collection << document
+      prepare_raw
+    end
+    def prepare_raw
+      @raw = ""
+      @collection.each do |c|
+        @raw += c.raw
+      end
+    end
+  end
+end

data/lib/poignant/document.rb ADDED Viewed

@@ -0,0 +1,18 @@
+require "poignant/word_operations"
+require "poignant/frequency_distribution"
+require "poignant/token_operations"
+module Poignant
+  # A single piece of text with word, token and frequency helpers mixed in.
+  class Document
+    include Poignant::WordOperations
+    include Poignant::FrequencyDistribution
+    include Poignant::TokenOperations
+    # The text exactly as it was passed to the constructor.
+    attr_reader :raw
+    # string - the text of the document; stored as-is in @raw, which the
+    #          mixed-in modules read.
+    def initialize(string)
+      @raw = string
+    end
+  end
+end

data/lib/poignant/frequency_distribution.rb ADDED Viewed

@@ -0,0 +1,23 @@
+module Poignant
+  module FrequencyDistribution
+    def frequency_distribution(tokens=false)
+      fd = {}
+      if tokens
+        unique_tokens.each do |token|
+          fd[token] = raw.scan(token).count
+        end
+      else
+        unique_words.each do |word|
+          fd[word] = normalized.scan(word).count
+        end
+      end
+      fd
+    end
+    def lexical_diversity
+      word_count * 1.0 / unique_words.count
+    end
+  end
+end

data/lib/poignant/ngrams.rb ADDED Viewed

@@ -0,0 +1,19 @@
+module Poignant
+  module NGrams
+    def bigrams
+      ngrams(2)
+    end
+    def trigrams
+      ngrams(3)
+    end
+    def ngrams(num)
+      ngrams = []
+      for i in num-1..(words.length-1)
+        ngrams << words[i-(num-1)..i]
+      end
+      ngrams
+    end
+  end
+end

data/lib/poignant/simple_tokenizer.rb ADDED Viewed

@@ -0,0 +1,8 @@
+module Poignant
+  # Minimal tokenizer built on String#split.
+  module SimpleTokenizer
+    # Splits string with String#split's default separator, so punctuation
+    # stays attached to the neighbouring word ("sentence.").
+    #
+    # Returns an Array of Strings.
+    def self.tokenize(string)
+      string.split
+    end
+  end
+end

data/lib/poignant/token_operations.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Poignant
+  module TokenOperations
+    def tokens
+      @raw.split
+    end
+    def unique_tokens
+      tokens.uniq
+    end
+    def token_count
+      tokens.count
+    end
+  end
+end

data/lib/poignant/tokenizer_interface.rb ADDED Viewed

@@ -0,0 +1,11 @@
+module Poignant
+  # Methods a tokenizer is expected to provide. Both defaults raise
+  # NotImplementedError until they are overridden.
+  module TokenizerInterface
+    # Override to turn string into tokens.
+    #
+    # Raises NotImplementedError unless overridden.
+    def tokenize(string)
+      raise NotImplementedError, "Implementing 'tokenize' is required for this class."
+    end
+    # Override to provide span tokenization of string.
+    # NOTE(review): the expected return value is not specified anywhere in
+    # this gem - confirm before implementing.
+    #
+    # Raises NotImplementedError unless overridden.
+    def span_tokenize(string)
+      raise NotImplementedError, "Implementing 'span_tokenize' is required for this class."
+    end
+  end
+end

data/lib/poignant/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Poignant
+  # Version of the poignant gem; poignant.gemspec reads it for gem.version.
+  VERSION = "0.0.1"
+end

data/lib/poignant/word_operations.rb ADDED Viewed

@@ -0,0 +1,20 @@
+module Poignant
+  module WordOperations
+    def words
+      normalized.split
+    end
+    def unique_words
+      words.uniq
+    end
+    def word_count
+      words.count
+    end
+    def normalized
+      @raw.downcase.gsub(/[^a-z ]/, '')
+    end
+  end
+end

data/lib/poignant.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require "poignant/version"
+require "poignant/corpus"
+require "poignant/document"
+require "poignant/simple_tokenizer"
+module Poignant
+  # Namespace for the gem; its contents are defined in the files required above.
+end

data/poignant.gemspec ADDED Viewed

@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'poignant/version'
+# Gem specification for poignant; the version comes from Poignant::VERSION.
+Gem::Specification.new do |gem|
+  gem.name          = "poignant"
+  gem.version       = Poignant::VERSION
+  gem.authors       = ["Aaron Massey"]
+  gem.email         = ["akmassey@sixlines.org"]
+  # NOTE(review): the description is shorter than the summary; they may be
+  # swapped.
+  gem.description   = %q{Poignant is a toolkit for natural language processing in Ruby.}
+  gem.summary       = %q{Poignant is a toolkit for natural language processing in Ruby.  It is named in honor of why the lucky stiff.}
+  # NOTE(review): homepage is blank.
+  gem.homepage      = ""
+  # Package every file tracked by git; the `git ls-files` output is split
+  # on $/.
+  gem.files         = `git ls-files`.split($/)
+  # Executables are the base names of the packaged files under bin/.
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  # Test files are the packaged files under test/, spec/ or features/.
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+  gem.add_development_dependency 'rspec'
+  gem.add_development_dependency 'rake'
+end

data/spec/corpus_spec.rb ADDED Viewed

@@ -0,0 +1,60 @@
+require 'poignant'
+module Poignant
+  # Examples for Corpus: construction, adding documents, lexical diversity.
+  describe Corpus do
+    before(:each) do
+      @small = "This is a small sentence."
+      @repeating = "This is a small sentence with repeated words.  Words are repeated in this small sentence."
+      @punctuation = "This is a small sentence. It includes? Punctuation!"
+      @document_small = Document.new(@small)
+      @document_repeating = Document.new(@repeating)
+    end
+    it "should be initialized with an array of documents" do
+      document = Document.new(@small)
+      another = Document.new(@repeating)
+      lambda { Corpus.new([document, another]) }.should_not raise_error
+    end
+    it "should be able to add a document to the collection" do
+      first = Document.new(@small)
+      corpus = Corpus.new([first])
+      lambda { corpus.add(Document.new(@repeating)) }.should_not raise_error
+    end
+    it "should be able to calculate a lexical diversity" do
+      corpus = Corpus.new([@document_small])
+      corpus.lexical_diversity.should eq 1.0
+      another = Corpus.new([@document_repeating])
+      another.lexical_diversity.should eq (15.0 / 10) # 15 words, 10 distinct: "this", "words", "repeated", "small" and "sentence" each occur twice
+    end
+    # NOTE(review): the examples below are disabled. As written they pass a
+    # String to Corpus.new, whereas the examples above pass an array of
+    # Documents.
+    # it "should be able to calculate a frequency distribution" do
+      # corpus = Corpus.new(@repeating)
+      # corpus.frequency_distribution["repeated"].should eq 2
+      # corpus.frequency_distribution["small"].should eq 2
+      # corpus.frequency_distribution["this"].should eq 2
+      # corpus.frequency_distribution["are"].should eq 1
+    # end
+    # it "should be able to calculate a frequency distribution for tokens" do
+      # corpus = Corpus.new(@repeating)
+      # corpus.frequency_distribution(tokens=true)["repeated"].should eq 2
+      # corpus.frequency_distribution(tokens=true)["words."].should eq 1
+    # end
+    # it "should be able to remove punctuation" do
+      # Corpus.new(@punctuation).normalized.should eq "this is a small sentence it includes punctuation"
+    # end
+    # it "should be able to calculate ngrams" do
+      # bigrams = Corpus.new(@small).ngrams(2)
+      # bigrams.should_not be_nil
+      # bigrams[0].should eq ["this", "is"]
+      # bigrams[1].should eq ["is", "a"]
+      # bigrams[2].should eq ["a", "small"]
+    # end
+  end
+end

data/spec/document_spec.rb ADDED Viewed

@@ -0,0 +1,55 @@
+require 'poignant'
+module Poignant
+  # Examples for Document: construction, lexical diversity, tokens,
+  # frequency distributions and normalization.
+  describe Document do
+    before(:each) do
+      @small = "This is a small sentence."
+      @repeating = "This is a small sentence with repeated words.  Words are repeated in this small sentence."
+      @punctuation = "This is a small sentence. It includes? Punctuation!"
+    end
+    it "should be initialized with a string" do
+      lambda { Document.new(@small) }.should_not raise_error
+    end
+    it "should be able to calculate a lexical diversity" do
+      corpus = Document.new(@small)
+      corpus.lexical_diversity.should eq 1.0
+      another = Document.new(@repeating)
+      another.lexical_diversity.should eq (15.0 / 10) # 15 words, 10 distinct: "this", "words", "repeated", "small" and "sentence" each occur twice
+    end
+    it "should be able to calculate unique tokens" do
+      document = Document.new(@repeating)
+      # 15 tokens, of which only "repeated" and "small" occur twice; tokens
+      # keep case and punctuation, so "This"/"this" and
+      # "sentence"/"sentence." are distinct.
+      document.unique_tokens.count.should eq 13
+    end
+    it "should be able to calculate a frequency distribution" do
+      corpus = Document.new(@repeating)
+      corpus.frequency_distribution["repeated"].should eq 2
+      corpus.frequency_distribution["small"].should eq 2
+      corpus.frequency_distribution["this"].should eq 2
+      corpus.frequency_distribution["are"].should eq 1
+    end
+    it "should be able to calculate a frequency distribution for tokens" do
+      corpus = Document.new(@repeating)
+      # `tokens=true` assigns a throwaway local and passes true positionally;
+      # it is not a keyword argument.
+      corpus.frequency_distribution(tokens=true)["repeated"].should eq 2
+      corpus.frequency_distribution(tokens=true)["words."].should eq 1
+    end
+    it "should be able to remove punctuation" do
+      Document.new(@punctuation).normalized.should eq "this is a small sentence it includes punctuation"
+    end
+    # NOTE(review): disabled - Document does not include the NGrams module,
+    # so it has no #ngrams method.
+    # it "should be able to calculate ngrams" do
+      # bigrams = Document.new(@small).ngrams(2)
+      # bigrams.should_not be_nil
+      # bigrams[0].should eq ["this", "is"]
+      # bigrams[1].should eq ["is", "a"]
+      # bigrams[2].should eq ["a", "small"]
+    # end
+  end
+end

data/spec/tokenizers_spec.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'poignant'
+module Poignant
+  # Examples for SimpleTokenizer.
+  describe SimpleTokenizer do
+    it "should be able to tokenize based on spaces" do
+      text = "This is a sentence."
+      # Punctuation stays attached to its word: "sentence." keeps the full stop.
+      result = ["This", "is", "a", "sentence."]
+      SimpleTokenizer.tokenize(text).should eq result
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,94 @@
+--- !ruby/object:Gem::Specification
+name: poignant
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Aaron Massey
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-04-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Poignant is a toolkit for natural language processing in Ruby.
+email:
+- akmassey@sixlines.org
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- lib/poignant.rb
+- lib/poignant/corpus.rb
+- lib/poignant/document.rb
+- lib/poignant/frequency_distribution.rb
+- lib/poignant/ngrams.rb
+- lib/poignant/simple_tokenizer.rb
+- lib/poignant/token_operations.rb
+- lib/poignant/tokenizer_interface.rb
+- lib/poignant/version.rb
+- lib/poignant/word_operations.rb
+- poignant.gemspec
+- spec/corpus_spec.rb
+- spec/document_spec.rb
+- spec/tokenizers_spec.rb
+homepage: ''
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.3
+signing_key:
+specification_version: 4
+summary: Poignant is a toolkit for natural language processing in Ruby.  It is named
+  in honor of why the lucky stiff.
+test_files:
+- spec/corpus_spec.rb
+- spec/document_spec.rb
+- spec/tokenizers_spec.rb