strabo 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ *~
7
+ *.gem
8
+ tmp
9
+ .yardoc
10
+ doc/*
data/Rakefile ADDED
@@ -0,0 +1,36 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Define the Jeweler gem tasks and the Gemcutter tasks for this project.
# Jeweler is optional: if it cannot be loaded, a hint is printed and the
# remaining tasks in this Rakefile are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "strabo"
    gemspec.summary = "Full text search utilities for Ruby"
    gemspec.description = "Simplified tokenization, stemming, and term-frequency map indexes"
    # No trailing whitespace: this value is copied into the gem metadata.
    gemspec.email = "jon.morton@gmail.com"
    gemspec.homepage = "http://github.com/jmorton/strabo"
    gemspec.authors = ["Jon Morton"]
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
18
+
19
# Define the YARD documentation task over everything under lib/.
# YARD is optional: if it cannot be loaded, a hint is printed instead.
begin
  # The library file is lowercase ("yard"). Requiring 'YARD' only works on
  # case-insensitive filesystems and raises LoadError everywhere else, which
  # made this task silently disappear even when the gem was installed.
  require 'yard'
  YARD::Rake::YardocTask.new do |t|
    t.files = ['lib/**/*.rb']
  end
rescue LoadError
  puts "Yard not available. Install it with: gem install yard"
end
27
+
28
# Define the :features task, which runs Cucumber with the options below.
# Cucumber is optional: if it cannot be loaded, a hint is printed instead.
begin
  require 'cucumber'
  require 'cucumber/rake/task'
  Cucumber::Rake::Task.new(:features) { |task| task.cucumber_opts = "features --format pretty" }
rescue LoadError
  puts "Cucumber not available. Install it with: gem install cucumber"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/examples/book.rb ADDED
@@ -0,0 +1,10 @@
1
# Example: build a keyword index for a Hash-backed document.
#
# The library is located relative to this file rather than the current
# working directory, so the example can be run from anywhere.
require File.expand_path(File.dirname(__FILE__) + '/../lib/strabo')

# A document type: a plain Hash that gains #keywords from Strabo::Indexer.
class Book < Hash
  include Strabo::Indexer
end

book = Book.new
book['title'] = 'Learn to Program'
book['author'] = 'Chris Pine'
# Returns the attribute => { term => frequency } map; the value is not printed.
book.keywords
@@ -0,0 +1,64 @@
1
+ Feature: Index a document
2
+ In order to make text searchable
3
+ A document's contents should be indexed
4
+
5
+ Background:
6
+ Given hash includes indexer
7
+
8
+ Scenario: Indexing
9
+ Given a hash with
10
+ | title | Strabo's Geographica |
11
+ | author | Strabo of Amaseia |
12
+ | summary | Strabo was born to an affluent family from Amaseia in Pontus |
13
+ Then it should have the following "title" term frequencies:
14
+ | term | frequency |
15
+ | geographica | 1 |
16
+ Then it should have the following "author" term frequencies:
17
+ | term | frequency |
18
+ | strabo | 1 |
19
+ | of | 1 |
20
+ | amaseia | 1 |
21
+ Then it should have the following "summary" term frequencies:
22
+ | term | frequency |
23
+ | strabo | 1 |
24
+ | was | 1 |
25
+ | born | 1 |
26
+ | to | 1 |
27
+ | an | 1 |
28
+ | affluent | 1 |
29
+ Then it should have the following term frequencies:
30
+ | term | frequency |
31
+ | strabo | 2 |
32
+ | amaseia | 2 |
33
+ | geographica | 1 |
34
+
35
+ Scenario: Abstract indexing
36
+ Given a hash with
37
+ | foo | x y x y x y |
38
+ | bar | x y z x y z |
39
+ | baz | a b c x y z |
40
+ Then it should have the following "foo" term frequencies:
41
+ | T | F |
42
+ | x | 3 |
43
+ | y | 3 |
44
+ Then it should have the following "bar" term frequencies:
45
+ | T | F |
46
+ | x | 2 |
47
+ | y | 2 |
48
+ | z | 2 |
49
+ Then it should have the following "baz" term frequencies:
50
+ | T | F |
51
+ | x | 1 |
52
+ | y | 1 |
53
+ | z | 1 |
54
+ | a | 1 |
55
+ | b | 1 |
56
+ | c | 1 |
57
+ Then it should have the following term frequencies:
58
+ | T | F |
59
+ | x | 6 |
60
+ | y | 6 |
61
+ | z | 3 |
62
+ | a | 1 |
63
+ | b | 1 |
64
+ | c | 1 |
@@ -0,0 +1,23 @@
1
+ Feature: Stem tokens
2
+ In order to increase recall
3
+ A document's tokens may be stemmed
4
+
5
+ Background:
6
+ Given hash includes indexer
7
+ And stemming uses the "ruby-stemmer" gem
8
+
9
+ Scenario: Stemming tokens
10
+ Given a hash with
11
+ | foo | jump jumps jumping jumper |
12
+ | bar | ran run running runner |
13
+ Then it should have the following "foo" term frequencies:
14
+ | T | F |
15
+ | jump | 3 |
16
+ | jumper | 1 |
17
+ Then it should have the following "bar" term frequencies:
18
+ | T | F |
19
+ | run | 2 |
20
+ | ran | 1 |
21
+ | runner | 1 |
22
+
23
+
@@ -0,0 +1,37 @@
1
# Mix Strabo::Indexer into the core Hash class so that any hash built by the
# following steps responds to #keywords.
Given 'hash includes indexer' do
  # `send` is used because Module#include is private on older Rubies.
  Hash.send(:include, Strabo::Indexer)
end
6
+
7
# Build the document under test: each table row becomes one entry in
# @context, copied from the table's rows_hash.
Given /a hash with/ do |table|
  @context = {}
  table.rows_hash.each { |attribute, text| @context[attribute] = text }
end
13
+
14
# Check the flattened (whole-document) term-frequency map against the table.
# The table may use the short headers T/F or the long headers term/frequency.
Then /it should have the following term frequencies:/ do |table|
  begin
    table.map_headers!('T' => 'term', 'F' => 'frequency')
  rescue
    # Deliberately ignored: if the headers cannot be remapped the table is
    # used as-is (presumably it already uses the long header names).
  end

  table.hashes.each do |expected|
    term  = expected['term']
    count = expected['frequency'].to_i
    @context.keywords(true)[term].should eql(count)
  end
end
25
+
26
# Check the term-frequency map of a single attribute against the table.
# The table may use the short headers T/F or the long headers term/frequency.
Then /it should have the following "(.+)" term frequencies:/ do |attribute, table|
  begin
    table.map_headers!('T' => 'term', 'F' => 'frequency')
  rescue
    # Deliberately ignored: if the headers cannot be remapped the table is
    # used as-is (presumably it already uses the long header names).
  end

  table.hashes.each do |expected|
    term = expected['term']
    # First the term must be present at all, then its count must match.
    @context.keywords[attribute].keys.should include(term)
    @context.keywords[attribute][term].should eql(expected['frequency'].to_i)
  end
end
@@ -0,0 +1,6 @@
1
# Configure Strabo to stem every token through Lingua.stemmer.
# The requires live inside the step so the stemmer library is only loaded by
# features that actually use it.
Given /stemming uses the "ruby-stemmer" gem/ do
  require 'rubygems'
  require 'lingua/stemmer'
  Strabo::Stemmer.stemmer = lambda do |word|
    Lingua.stemmer(word)
  end
end
6
+
@@ -0,0 +1,5 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../lib/strabo')
2
+
3
+ require 'cucumber/formatter/unicode' # Remove this line if you don't want Cucumber Unicode support
4
+ require 'cucumber/web/tableish'
5
+ require 'spec/expectations'
data/lib/strabo.rb ADDED
@@ -0,0 +1,115 @@
1
# The standard library file is lowercase; requiring 'Set' raises LoadError on
# case-sensitive filesystems.
require 'set'

# Strabo assists full text search indexing by generating term-frequency maps
# for an object's attributes. The per-attribute maps may also be flattened
# into a single index for the entire object.
#
# Strabo was written with MongoDB in mind. The idea is that a document will
# store its own embedded keyword index that MongoDB can use for full text
# search.
#
# @example Using Strabo
#   class Book < Hash
#     include Strabo::Indexer
#   end
#
#   book = Book.new
#   book['title'] = 'Learn to Program'
#   book['author'] = 'Chris Pine'
#   book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
#
# @author Jon Morton
#
module Strabo

  # Stemming configuration. By default, Strabo performs no stemming.
  #
  # @example Configuring stemming
  #   require 'rubygems'
  #   require 'lingua/stemmer'
  #   Strabo::Stemmer.stemmer = lambda { |term| Lingua.stemmer(term) }
  #
  # @see http://github.com/aurelian/ruby-stemmer Ruby-Stemmer on github
  #
  module Stemmer

    # Set the stemmer used during tokenization. Assigning nil (or false)
    # turns stemming off again.
    #
    # @param [#call, nil] stemmer called with individual tokens; its return
    #   value is used as the stemmed term
    #
    # @see Strabo::Stemmer.stem
    def self.stemmer=(stemmer)
      @stemmer = stemmer
    end

    # Invokes the configured stemmer on a token. If no stemmer has been
    # configured, the original token is returned unchanged.
    #
    # @param [String] token
    #
    # @return [String] result of stemming
    #
    # @see Strabo::Stemmer.stemmer=
    def self.stem(token)
      @stemmer ? @stemmer.call(token) : token
    end
  end

  # Mixin that adds #keywords to any object whose #each yields
  # (attribute, value) pairs, such as a Hash.
  module Indexer

    # Get the attribute-term-frequency map. If flattened, a single
    # term-frequency map without the context of the attribute.
    #
    # The map is rebuilt on every call and is not stored on the receiver, so
    # frozen documents can be indexed and the document itself is not changed.
    #
    # @param [Boolean] flatten whether to merge all attributes into one map
    #
    # @return [Hash] { attribute => { term => frequency } } or
    #   { term => frequency } map.
    def keywords(flatten = false)
      term_map = {}
      each { |key, value| term_map[key] = frequency(tokenize(value)) }
      flatten ? flatten_keyword_map(term_map) : term_map
    end

    private

    # Break a value into a list of lowercased, stemmed terms.
    #
    # Non-String values are converted with #to_s first, so nil yields no
    # terms and numbers are indexed by their textual form.
    #
    # @param [Object] value text to convert into a list
    # @param [Regexp] delimiter pattern that matches one token; despite the
    #   name it is passed to String#scan, so it describes the tokens
    #   themselves rather than the separators between them
    #
    # @return [Array<String>] list of stemmed terms
    #
    # @private
    def tokenize(value, delimiter = /\S+/)
      value.to_s.downcase.scan(delimiter).map { |token| Strabo::Stemmer.stem(token) }
    end

    # Tally the number of occurrences of each value in a list.
    #
    # The result is a plain Hash without a default value, so looking up a
    # term that never occurred returns nil.
    #
    # @param [Array] values list of terms to count
    #
    # @return [Hash] term-frequency map
    #
    # @private
    def frequency(values)
      values.inject({}) do |counts, term|
        counts[term] = (counts[term] || 0) + 1
        counts
      end
    end

    # Merge the per-attribute maps into one map by summing the frequency of
    # each term across all attributes.
    #
    # @param [Hash] map { attribute => { term => frequency } }
    #
    # @return [Hash] { term => frequency }
    #
    # @see Strabo::Indexer#keywords
    #
    # @private
    def flatten_keyword_map(map)
      totals = {}
      map.each do |_attribute, terms|
        terms.each do |term, count|
          totals[term] = (totals[term] || 0) + count
        end
      end
      totals
    end

  end
end
data/readme.markdown ADDED
@@ -0,0 +1,14 @@
1
+ # Strabo
2
+
3
+ ## About
4
+
5
+ Strabo prepares a Ruby object for full text search by tokenizing the object's attributes.
6
+
7
+ class Book < Hash
8
+ include Strabo::Indexer
9
+ end
10
+
11
+ book = Book.new
12
+ book['title'] = 'Learn to Program'
13
+ book['author'] = 'Chris Pine'
14
+ book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: strabo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Jon Morton
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-03-28 00:00:00 -04:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Simplified tokenization, stemming, and term-frequency map indexes
17
+ email: "jon.morton@gmail.com"
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - .gitignore
26
+ - Rakefile
27
+ - VERSION
28
+ - examples/book.rb
29
+ - features/index.feature
30
+ - features/stemming.feature
31
+ - features/steps/index_steps.rb
32
+ - features/steps/stemmer_steps.rb
33
+ - features/support/env.rb
34
+ - lib/strabo.rb
35
+ - readme.markdown
36
+ has_rdoc: true
37
+ homepage: http://github.com/jmorton/strabo
38
+ licenses: []
39
+
40
+ post_install_message:
41
+ rdoc_options:
42
+ - --charset=UTF-8
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: "0"
50
+ version:
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: "0"
56
+ version:
57
+ requirements: []
58
+
59
+ rubyforge_project:
60
+ rubygems_version: 1.3.5
61
+ signing_key:
62
+ specification_version: 3
63
+ summary: Full text search utilities for Ruby
64
+ test_files:
65
+ - examples/book.rb