RubyGems - strabo - Versions diffs - 0.0.0 - Mend

Files changed (12) hide show

data/.gitignore +10 -0
data/Rakefile +36 -0
data/VERSION +1 -0
data/examples/book.rb +10 -0
data/features/index.feature +64 -0
data/features/stemming.feature +23 -0
data/features/steps/index_steps.rb +37 -0
data/features/steps/stemmer_steps.rb +6 -0
data/features/support/env.rb +5 -0
data/lib/strabo.rb +115 -0
data/readme.markdown +14 -0
metadata +65 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,10 @@
+*.sw?
+.DS_Store
+coverage
+rdoc
+pkg
+*~
+*.gem
+tmp
+.yardoc
+doc/*

data/Rakefile ADDED Viewed

@@ -0,0 +1,36 @@
+require 'rubygems'
+require 'rake'
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gemspec|
+    gemspec.name = "strabo"
+    gemspec.summary = "Full text search utilities for Ruby"
+    gemspec.description = "Simplified tokenization, stemming, and term-frequency map indexes"
+    gemspec.email = "jon.morton@gmail.com "
+    gemspec.homepage = "http://github.com/jmorton/strabo"
+    gemspec.authors = ["Jon Morton"]
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler not available. Install it with: gem install jeweler"
+end
+begin
+  require 'YARD'
+  YARD::Rake::YardocTask.new do |t|
+    t.files   = ['lib/**/*.rb']
+  end
+rescue LoadError
+  puts "Yard not available. Install it with: gem install yard"
+end
+begin
+  require 'cucumber'
+  require 'cucumber/rake/task'
+  Cucumber::Rake::Task.new(:features) do |t|
+    t.cucumber_opts = "features --format pretty"
+  end
+rescue LoadError
+  puts "Cucumber not available. Install it with: gem install cucumber"
+end

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.0.0

data/examples/book.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require '../lib/strabo'
+class Book < Hash
+  include Strabo::Indexer
+end
+book = Book.new
+book['title'] = 'Learn to Program'
+book['author'] = 'Chris Pine'
+book.keywords

data/features/index.feature ADDED Viewed

@@ -0,0 +1,64 @@
+Feature: Index a document
+  In order to make text searchable
+  A document's contents should be indexed
+  Background:
+    Given hash includes indexer
+  Scenario: Indexing
+    Given a hash with
+      | title   | Strabo's Geographica                                         |
+      | author  | Strabo of Amaseia                                            |
+      | summary | Strabo was born to an affluent family from Amaseia in Pontus |
+    Then it should have the following "title" term frequencies:
+      | term         | frequency |
+      | geographica  | 1         |
+    Then it should have the following "author" term frequencies:
+      | term         | frequency |
+      | strabo       | 1         |
+      | of           | 1         |
+      | amaseia      | 1         |
+    Then it should have the following "summary" term frequencies:
+      | term         | frequency |
+      | strabo       | 1         |
+      | was          | 1         |
+      | born         | 1         |
+      | to           | 1         |
+      | an           | 1         |
+      | affluent     | 1         |
+    Then it should have the following term frequencies:
+      | term         | frequency |
+      | strabo       | 2         |
+      | amaseia      | 2         |
+      | geographica  | 1         |
+  Scenario: Abstract indexing
+    Given a hash with
+      | foo   | x y x y x y |
+      | bar   | x y z x y z |
+      | baz   | a b c x y z |
+    Then it should have the following "foo" term frequencies:
+      | T | F |
+      | x | 3 |
+      | y | 3 |
+    Then it should have the following "bar" term frequencies:
+      | T | F |
+      | x | 2 |
+      | y | 2 |
+      | z | 2 |
+    Then it should have the following "baz" term frequencies:
+      | T | F |
+      | x | 1 |
+      | y | 1 |
+      | z | 1 |
+      | a | 1 |
+      | b | 1 |
+      | c | 1 |
+    Then it should have the following term frequencies:
+      | T | F |
+      | x | 6 |
+      | y | 6 |
+      | z | 3 |
+      | a | 1 |
+      | b | 1 |
+      | c | 1 |

data/features/stemming.feature ADDED Viewed

@@ -0,0 +1,23 @@
+Feature: Stem tokens
+  In order to increase recall
+  A document's tokens may be stemmed
+  Background:
+    Given hash includes indexer
+    And stemming uses the "ruby-stemmer" gem
+  Scenario: Stemming tokens
+    Given a hash with
+      | foo   | jump jumps jumping jumper  |
+      | bar   | ran run running runner     |
+    Then it should have the following "foo" term frequencies:
+      | T      | F  |
+      | jump   | 3  |
+      | jumper | 1  |
+    Then it should have the following "bar" term frequencies:
+      | T      | F  |
+      | run    | 2  |
+      | ran    | 1  |
+      | runner | 1  |

data/features/steps/index_steps.rb ADDED Viewed

@@ -0,0 +1,37 @@
+Given 'hash includes indexer' do
+  class Hash
+    include Strabo::Indexer
+  end
+end
+Given /a hash with/ do |table|
+  @context = {}
+  table.rows_hash.each do |keys, values|
+    @context[keys] = values
+  end
+end
+Then /it should have the following term frequencies:/ do |table|
+  begin
+    table.map_headers!('T' => 'term', 'F' => 'frequency')
+  rescue
+    # no big deal
+  end
+  table.hashes.each do |row|
+    @context.keywords(true)[row['term']].should eql(row['frequency'].to_i)
+  end
+end
+Then /it should have the following "(.+)" term frequencies:/ do |attribute, table|
+  begin
+    table.map_headers!('T' => 'term', 'F' => 'frequency')
+  rescue
+    # no big deal
+  end
+  table.hashes.each do |row|
+    @context.keywords[attribute].keys.should include(row['term'])
+    @context.keywords[attribute][row['term']].should eql(row['frequency'].to_i)
+  end
+end

data/features/steps/stemmer_steps.rb ADDED Viewed

@@ -0,0 +1,6 @@
+Given /stemming uses the "ruby-stemmer" gem/ do
+  require 'rubygems'
+  require 'lingua/stemmer'
+  Strabo::Stemmer.stemmer = lambda { |term| Lingua.stemmer(term) }
+end

data/features/support/env.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require File.expand_path(File.dirname(__FILE__) + '/../../lib/strabo')
+require 'cucumber/formatter/unicode' # Remove this line if you don't want Cucumber Unicode support
+require 'cucumber/web/tableish'
+require 'spec/expectations'

data/lib/strabo.rb ADDED Viewed

@@ -0,0 +1,115 @@
+require 'Set'
+# Strabo assists full text search indexing by generating term-frequency maps
+# for an object's attributes.  The term-frequency map may be flattened into
+# an index for the entire object.
+#
+# Strabo was written with MongoDB in mind.  The idea is that a document will
+# store its own embedded keyword index that MongoDB can use for full text
+# search.
+#
+# @example: Using strabo
+#   class Book < Hash
+#     include Strabo
+#   end
+#
+#   book = Book.new
+#   book['title'] = 'Learn to Program'
+#   book['author'] = 'Chris Pine'
+#   b.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
+#
+# @author: Jon Morton
+#
+module Strabo
+  # Stemming configuration.  By default, Strabo performs no stemming.
+  #
+  # @example: Configuring stemming
+  #  require 'rubygems'
+  #  require 'lingua/stemmer'
+  #  Strabo::Stemmer.stemmer = lambda { |term| Lingua.stemmer(term) }
+  #
+  # @see http://github.com/aurelian/ruby-stemmer Ruby-Stemmer on github
+  #
+  module Stemmer
+    # Set the stemmer used during tokenization.
+    #
+    # @param [lambda] stemmer called with individual tokens
+    #
+    # @see Strabo#stem
+    def self.stemmer=(stemmer)
+      @stemmer = stemmer
+    end
+    # Invokes stemmer on token.  If no stemmer has been configured, it will
+    # return the original token.
+    #
+    # @param [String] token
+    #
+    # @return [String] result of stemming
+    #
+    # @see Strabo#stemmer
+    def self.stem(token)
+      @stemmer.nil? ? token : @stemmer.call(token)
+    end
+  end
+  module Indexer
+    # Get attribute-term-frequency map. If flattened, a term-frequency map
+    # without the context of the attribute.
+    #
+    # @param [TrueClass, FalseClass] flatten
+    #
+    # @return [Hash] { attribute => { term => frequency } } or
+    #                { term => frequency } map.
+    def keywords(flatten = false)
+      @term_map = {}
+      self.each { |key, value| @term_map[key] = frequency(tokenize(value)) }
+      flatten ? flatten_keyword_map(@term_map) : @term_map
+    end
+  private
+    # Break a string into a list of strings.
+    #
+    # @param [String] text to convert into a list
+    # @param [Regex] delimiter used to scan the string
+    #
+    # @return [Array] list of stemmed terms
+    #
+    # @private
+    def tokenize(value, delimiter = /\S+/)
+      value.downcase.scan(delimiter).map { |token| Strabo::Stemmer.stem(token) }
+    end
+    # Tally the number of occurrences of a value in a list.
+    #
+    # @param [Array] list of terms to count
+    #
+    # @return [Hash] term-frequency map
+    #
+    # @private
+    def frequency(values)
+      values.inject(Hash.new) do |h, term|
+        h[term] = (h[term].nil?) ? (1) : (h[term] + 1)
+        h
+      end
+    end
+    # @see Strabo::Indexer#keywords
+    #
+    # @private
+    def flatten_keyword_map(map)
+      h = {}
+      map.each do |att, terms|
+        terms.each do |term, frequency|
+          h[term] = (h[term] || 0) + frequency
+        end
+      end
+      h
+    end
+  end
+end

data/readme.markdown ADDED Viewed

@@ -0,0 +1,14 @@
+# Strabo
+## About
+Strabo prepares a Ruby object for full text search by tokenizing the object's attributes.
+    class Book < Hash
+      include Strabo::Indexer
+    end
+    book = Book.new
+    book['title'] = 'Learn to Program'
+    book['author'] = 'Chris Pine'
+    book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}

metadata ADDED Viewed

@@ -0,0 +1,65 @@
+--- !ruby/object:Gem::Specification
+name: strabo
+version: !ruby/object:Gem::Version
+  version: 0.0.0
+platform: ruby
+authors:
+- Jon Morton
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-03-28 00:00:00 -04:00
+default_executable:
+dependencies: []
+description: Simplified tokenization, stemming, and term-frequency map indexes
+email: "jon.morton@gmail.com "
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Rakefile
+- VERSION
+- examples/book.rb
+- features/index.feature
+- features/stemming.feature
+- features/steps/index_steps.rb
+- features/steps/stemmer_steps.rb
+- features/support/env.rb
+- lib/strabo.rb
+- readme.markdown
+has_rdoc: true
+homepage: http://github.com/jmorton/strabo
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Full text search utilities for Ruby
+test_files:
+- examples/book.rb

strabo 0.0.0