strabo 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ *~
7
+ *.gem
8
+ tmp
9
+ .yardoc
10
+ doc/*
data/Rakefile ADDED
@@ -0,0 +1,36 @@
1
# Rake tasks for the strabo gem: packaging/release (Jeweler), API
# documentation (YARD) and acceptance tests (Cucumber).
#
# Every optional tool is loaded inside its own begin/rescue so that a missing
# gem only disables the related tasks and prints an install hint instead of
# aborting rake altogether.
require 'rubygems'
require 'rake'

# Gem packaging and release tasks.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "strabo"
    gemspec.summary = "Full text search utilities for Ruby"
    gemspec.description = "Simplified tokenization, stemming, and term-frequency map indexes"
    # No trailing whitespace here: the value ends up in the gem metadata as written.
    gemspec.email = "jon.morton@gmail.com"
    gemspec.homepage = "http://github.com/jmorton/strabo"
    gemspec.authors = ["Jon Morton"]
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end

# API documentation task.
begin
  # The feature name must be lower case: require 'YARD' only resolves on
  # case-insensitive file systems and raises LoadError everywhere else.
  require 'yard'
  YARD::Rake::YardocTask.new do |t|
    t.files = ['lib/**/*.rb']
  end
rescue LoadError
  puts "Yard not available. Install it with: gem install yard"
end

# Acceptance tests (rake features).
begin
  require 'cucumber'
  require 'cucumber/rake/task'
  Cucumber::Rake::Task.new(:features) do |t|
    t.cucumber_opts = "features --format pretty"
  end
rescue LoadError
  puts "Cucumber not available. Install it with: gem install cucumber"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/examples/book.rb ADDED
@@ -0,0 +1,10 @@
1
# Minimal usage example: mix Strabo::Indexer into a Hash subclass and build
# the per-attribute term-frequency map for one document.

# Resolve the library relative to this file. A bare '../lib/strabo' is looked
# up relative to the current working directory, so the example would only
# work when started from inside the examples directory.
require File.expand_path('../lib/strabo', File.dirname(__FILE__))

# A document type whose key/value pairs are the attributes to index.
class Book < Hash
  include Strabo::Indexer
end

book = Book.new
book['title'] = 'Learn to Program'
book['author'] = 'Chris Pine'
book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
@@ -0,0 +1,64 @@
1
+ Feature: Index a document
2
+ In order to make text searchable
3
+ A document's contents should be indexed
4
+
5
+ Background:
6
+ Given hash includes indexer
7
+
8
+ Scenario: Indexing
9
+ Given a hash with
10
+ | title | Strabo's Geographica |
11
+ | author | Strabo of Amaseia |
12
+ | summary | Strabo was born to an affluent family from Amaseia in Pontus |
13
+ Then it should have the following "title" term frequencies:
14
+ | term | frequency |
15
+ | geographica | 1 |
16
+ Then it should have the following "author" term frequencies:
17
+ | term | frequency |
18
+ | strabo | 1 |
19
+ | of | 1 |
20
+ | amaseia | 1 |
21
+ Then it should have the following "summary" term frequencies:
22
+ | term | frequency |
23
+ | strabo | 1 |
24
+ | was | 1 |
25
+ | born | 1 |
26
+ | to | 1 |
27
+ | an | 1 |
28
+ | affluent | 1 |
29
+ Then it should have the following term frequencies:
30
+ | term | frequency |
31
+ | strabo | 2 |
32
+ | amaseia | 2 |
33
+ | geographica | 1 |
34
+
35
+ Scenario: Abstract indexing
36
+ Given a hash with
37
+ | foo | x y x y x y |
38
+ | bar | x y z x y z |
39
+ | baz | a b c x y z |
40
+ Then it should have the following "foo" term frequencies:
41
+ | T | F |
42
+ | x | 3 |
43
+ | y | 3 |
44
+ Then it should have the following "bar" term frequencies:
45
+ | T | F |
46
+ | x | 2 |
47
+ | y | 2 |
48
+ | z | 2 |
49
+ Then it should have the following "baz" term frequencies:
50
+ | T | F |
51
+ | x | 1 |
52
+ | y | 1 |
53
+ | z | 1 |
54
+ | a | 1 |
55
+ | b | 1 |
56
+ | c | 1 |
57
+ Then it should have the following term frequencies:
58
+ | T | F |
59
+ | x | 6 |
60
+ | y | 6 |
61
+ | z | 3 |
62
+ | a | 1 |
63
+ | b | 1 |
64
+ | c | 1 |
@@ -0,0 +1,23 @@
1
+ Feature: Stem tokens
2
+ In order to increase recall
3
+ A document's tokens may be stemmed
4
+
5
+ Background:
6
+ Given hash includes indexer
7
+ And stemming uses the "ruby-stemmer" gem
8
+
9
+ Scenario: Stemming tokens
10
+ Given a hash with
11
+ | foo | jump jumps jumping jumper |
12
+ | bar | ran run running runner |
13
+ Then it should have the following "foo" term frequencies:
14
+ | T | F |
15
+ | jump | 3 |
16
+ | jumper | 1 |
17
+ Then it should have the following "bar" term frequencies:
18
+ | T | F |
19
+ | run | 2 |
20
+ | ran | 1 |
21
+ | runner | 1 |
22
+
23
+
@@ -0,0 +1,37 @@
1
# Cucumber step definitions for the indexing and stemming features.

# Mixes the indexer into Hash itself so that the plain hash built by the
# "a hash with" step responds to #keywords.
Given 'hash includes indexer' do
  class Hash
    include Strabo::Indexer
  end
end

# Builds the document under test from the step's table via rows_hash,
# storing each key/value pair in @context.
Given /a hash with/ do |table|
  @context = {}
  table.rows_hash.each do |attribute, text|
    @context[attribute] = text
  end
end

# Checks the flattened (whole-document) term-frequency map.
Then /it should have the following term frequencies:/ do |table|
  begin
    # Allow the abbreviated T/F column headers used by some scenarios.
    table.map_headers!('T' => 'term', 'F' => 'frequency')
  rescue
    # Deliberately best effort: presumably the mapping fails for tables that
    # already use the full 'term'/'frequency' headers (not verified here).
  end

  # The map does not change between rows, so compute it once instead of
  # re-tokenizing the whole document for every table row.
  flattened = @context.keywords(true)
  table.hashes.each do |row|
    flattened[row['term']].should eql(row['frequency'].to_i)
  end
end

# Checks the term-frequency map of a single attribute.
Then /it should have the following "(.+)" term frequencies:/ do |attribute, table|
  begin
    # Allow the abbreviated T/F column headers used by some scenarios.
    table.map_headers!('T' => 'term', 'F' => 'frequency')
  rescue
    # Deliberately best effort: presumably the mapping fails for tables that
    # already use the full 'term'/'frequency' headers (not verified here).
  end

  # Computed once per step rather than twice per row.
  frequencies = @context.keywords[attribute]
  table.hashes.each do |row|
    frequencies.keys.should include(row['term'])
    frequencies[row['term']].should eql(row['frequency'].to_i)
  end
end
@@ -0,0 +1,6 @@
1
# Configures Strabo to stem tokens with the ruby-stemmer gem. The gem is
# required inside the step so that it is only loaded by scenarios that ask
# for stemming.
Given /stemming uses the "ruby-stemmer" gem/ do
  require 'rubygems'
  require 'lingua/stemmer'
  Strabo::Stemmer.stemmer = lambda do |token|
    Lingua.stemmer(token)
  end
end
6
+
@@ -0,0 +1,5 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../lib/strabo')
2
+
3
+ require 'cucumber/formatter/unicode' # Remove this line if you don't want Cucumber Unicode support
4
+ require 'cucumber/web/tableish'
5
+ require 'spec/expectations'
data/lib/strabo.rb ADDED
@@ -0,0 +1,115 @@
1
# Feature names are case-sensitive on most file systems; the standard library
# file is 'set', so the original require 'Set' failed to load there.
require 'set'

# Strabo assists full text search indexing by generating term-frequency maps
# for an object's attributes. The term-frequency map may be flattened into
# an index for the entire object.
#
# Strabo was written with MongoDB in mind. The idea is that a document will
# store its own embedded keyword index that MongoDB can use for full text
# search.
#
# @example Using strabo
#   class Book < Hash
#     include Strabo::Indexer
#   end
#
#   book = Book.new
#   book['title'] = 'Learn to Program'
#   book['author'] = 'Chris Pine'
#   book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
#
# @author Jon Morton
#
module Strabo

  # Stemming configuration. By default, Strabo performs no stemming.
  #
  # @example Configuring stemming
  #   require 'rubygems'
  #   require 'lingua/stemmer'
  #   Strabo::Stemmer.stemmer = lambda { |term| Lingua.stemmer(term) }
  #
  # @see http://github.com/aurelian/ruby-stemmer Ruby-Stemmer on github
  #
  module Stemmer

    # Set the stemmer used during tokenization. Assigning nil switches
    # stemming off again.
    #
    # @param [#call, nil] stemmer called with individual tokens
    #
    # @see Strabo::Stemmer.stem
    def self.stemmer=(stemmer)
      @stemmer = stemmer
    end

    # Invokes stemmer on token. If no stemmer has been configured, it will
    # return the original token.
    #
    # @param [String] token
    #
    # @return [String] result of stemming
    #
    # @see Strabo::Stemmer.stemmer=
    def self.stem(token)
      return token if @stemmer.nil?

      @stemmer.call(token)
    end
  end

  # Mixin for hash-like objects: the host must provide an #each that yields
  # attribute/value pairs.
  module Indexer

    # Get attribute-term-frequency map. If flattened, a term-frequency map
    # without the context of the attribute.
    #
    # The map is rebuilt on every call and kept in a local variable, so the
    # host object is not modified and frozen objects can be indexed too.
    #
    # @param [TrueClass, FalseClass] flatten
    #
    # @return [Hash] { attribute => { term => frequency } } or
    #   { term => frequency } map.
    def keywords(flatten = false)
      term_map = {}
      each { |key, value| term_map[key] = frequency(tokenize(value)) }
      flatten ? flatten_keyword_map(term_map) : term_map
    end

    private

    # Break a value into a list of lower-cased, stemmed terms.
    #
    # @param [#to_s] value text to convert into a list; non-String values
    #   (numbers, symbols, nil) are converted with #to_s instead of raising
    # @param [Regexp] delimiter pattern passed to String#scan; despite its
    #   name it matches the tokens themselves, not the separators
    #
    # @return [Array] list of stemmed terms
    #
    # @private
    def tokenize(value, delimiter = /\S+/)
      value.to_s.downcase.scan(delimiter).map { |token| Strabo::Stemmer.stem(token) }
    end

    # Tally the number of occurrences of a value in a list.
    #
    # @param [Array] values list of terms to count
    #
    # @return [Hash] term-frequency map; a plain Hash without a default
    #   value, so looking up an unknown term yields nil
    #
    # @private
    def frequency(values)
      counts = {}
      values.each { |term| counts[term] = (counts[term] || 0) + 1 }
      counts
    end

    # Sum the per-attribute frequencies of every term into one map.
    #
    # @param [Hash] map { attribute => { term => frequency } }
    #
    # @return [Hash] { term => frequency } across all attributes
    #
    # @see Strabo::Indexer#keywords
    #
    # @private
    def flatten_keyword_map(map)
      totals = {}
      map.each_value do |terms|
        terms.each { |term, count| totals[term] = (totals[term] || 0) + count }
      end
      totals
    end

  end
end
data/readme.markdown ADDED
@@ -0,0 +1,14 @@
1
+ # Strabo
2
+
3
+ ## About
4
+
5
+ Strabo prepares a Ruby object for full text search by tokenizing the object's attributes.
6
+
7
+ class Book < Hash
8
+ include Strabo::Indexer
9
+ end
10
+
11
+ book = Book.new
12
+ book['title'] = 'Learn to Program'
13
+ book['author'] = 'Chris Pine'
14
+ book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: strabo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Jon Morton
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-03-28 00:00:00 -04:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Simplified tokenization, stemming, and term-frequency map indexes
17
+ email: "jon.morton@gmail.com "
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - .gitignore
26
+ - Rakefile
27
+ - VERSION
28
+ - examples/book.rb
29
+ - features/index.feature
30
+ - features/stemming.feature
31
+ - features/steps/index_steps.rb
32
+ - features/steps/stemmer_steps.rb
33
+ - features/support/env.rb
34
+ - lib/strabo.rb
35
+ - readme.markdown
36
+ has_rdoc: true
37
+ homepage: http://github.com/jmorton/strabo
38
+ licenses: []
39
+
40
+ post_install_message:
41
+ rdoc_options:
42
+ - --charset=UTF-8
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: "0"
50
+ version:
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: "0"
56
+ version:
57
+ requirements: []
58
+
59
+ rubyforge_project:
60
+ rubygems_version: 1.3.5
61
+ signing_key:
62
+ specification_version: 3
63
+ summary: Full text search utilities for Ruby
64
+ test_files:
65
+ - examples/book.rb