strabo 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +10 -0
- data/Rakefile +36 -0
- data/VERSION +1 -0
- data/examples/book.rb +10 -0
- data/features/index.feature +64 -0
- data/features/stemming.feature +23 -0
- data/features/steps/index_steps.rb +37 -0
- data/features/steps/stemmer_steps.rb +6 -0
- data/features/support/env.rb +5 -0
- data/lib/strabo.rb +115 -0
- data/readme.markdown +14 -0
- metadata +65 -0
data/.gitignore
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Define the Jeweler gem tasks from the gemspec settings below. When Jeweler
# is not installed the tasks are skipped and an install hint is printed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "strabo"
    gemspec.summary = "Full text search utilities for Ruby"
    gemspec.description = "Simplified tokenization, stemming, and term-frequency map indexes"
    # No trailing whitespace here: the value ends up verbatim in the
    # generated gem metadata.
    gemspec.email = "jon.morton@gmail.com"
    gemspec.homepage = "http://github.com/jmorton/strabo"
    gemspec.authors = ["Jon Morton"]
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
|
18
|
+
|
19
|
+
# Define the YARD documentation task over every Ruby file under lib/. When
# YARD is not installed the task is skipped and an install hint is printed.
begin
  # The library file is named in lowercase; requiring 'YARD' raises LoadError
  # on case-sensitive file systems even when the gem is installed.
  require 'yard'
  YARD::Rake::YardocTask.new do |t|
    t.files = ['lib/**/*.rb']
  end
rescue LoadError
  puts "Yard not available. Install it with: gem install yard"
end
|
27
|
+
|
28
|
+
# Register the :features Rake task, passing "features --format pretty" as the
# Cucumber options. When Cucumber cannot be loaded, print an install hint
# instead of defining the task.
begin
  require 'cucumber'
  require 'cucumber/rake/task'

  Cucumber::Rake::Task.new(:features) { |task| task.cucumber_opts = "features --format pretty" }
rescue LoadError
  puts "Cucumber not available. Install it with: gem install cucumber"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
data/examples/book.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
Feature: Index a document
|
2
|
+
In order to make text searchable
|
3
|
+
A document's contents should be indexed
|
4
|
+
|
5
|
+
Background:
|
6
|
+
Given hash includes indexer
|
7
|
+
|
8
|
+
Scenario: Indexing
|
9
|
+
Given a hash with
|
10
|
+
| title | Strabo's Geographica |
|
11
|
+
| author | Strabo of Amaseia |
|
12
|
+
| summary | Strabo was born to an affluent family from Amaseia in Pontus |
|
13
|
+
Then it should have the following "title" term frequencies:
|
14
|
+
| term | frequency |
|
15
|
+
| geographica | 1 |
|
16
|
+
Then it should have the following "author" term frequencies:
|
17
|
+
| term | frequency |
|
18
|
+
| strabo | 1 |
|
19
|
+
| of | 1 |
|
20
|
+
| amaseia | 1 |
|
21
|
+
Then it should have the following "summary" term frequencies:
|
22
|
+
| term | frequency |
|
23
|
+
| strabo | 1 |
|
24
|
+
| was | 1 |
|
25
|
+
| born | 1 |
|
26
|
+
| to | 1 |
|
27
|
+
| an | 1 |
|
28
|
+
| affluent | 1 |
|
29
|
+
Then it should have the following term frequencies:
|
30
|
+
| term | frequency |
|
31
|
+
| strabo | 2 |
|
32
|
+
| amaseia | 2 |
|
33
|
+
| geographica | 1 |
|
34
|
+
|
35
|
+
Scenario: Abstract indexing
|
36
|
+
Given a hash with
|
37
|
+
| foo | x y x y x y |
|
38
|
+
| bar | x y z x y z |
|
39
|
+
| baz | a b c x y z |
|
40
|
+
Then it should have the following "foo" term frequencies:
|
41
|
+
| T | F |
|
42
|
+
| x | 3 |
|
43
|
+
| y | 3 |
|
44
|
+
Then it should have the following "bar" term frequencies:
|
45
|
+
| T | F |
|
46
|
+
| x | 2 |
|
47
|
+
| y | 2 |
|
48
|
+
| z | 2 |
|
49
|
+
Then it should have the following "baz" term frequencies:
|
50
|
+
| T | F |
|
51
|
+
| x | 1 |
|
52
|
+
| y | 1 |
|
53
|
+
| z | 1 |
|
54
|
+
| a | 1 |
|
55
|
+
| b | 1 |
|
56
|
+
| c | 1 |
|
57
|
+
Then it should have the following term frequencies:
|
58
|
+
| T | F |
|
59
|
+
| x | 6 |
|
60
|
+
| y | 6 |
|
61
|
+
| z | 3 |
|
62
|
+
| a | 1 |
|
63
|
+
| b | 1 |
|
64
|
+
| c | 1 |
|
@@ -0,0 +1,23 @@
|
|
1
|
+
Feature: Stem tokens
|
2
|
+
In order to increase recall
|
3
|
+
A document's tokens may be stemmed
|
4
|
+
|
5
|
+
Background:
|
6
|
+
Given hash includes indexer
|
7
|
+
And stemming uses the "ruby-stemmer" gem
|
8
|
+
|
9
|
+
Scenario: Stemming tokens
|
10
|
+
Given a hash with
|
11
|
+
| foo | jump jumps jumping jumper |
|
12
|
+
| bar | ran run running runner |
|
13
|
+
Then it should have the following "foo" term frequencies:
|
14
|
+
| T | F |
|
15
|
+
| jump | 3 |
|
16
|
+
| jumper | 1 |
|
17
|
+
Then it should have the following "bar" term frequencies:
|
18
|
+
| T | F |
|
19
|
+
| run | 2 |
|
20
|
+
| ran | 1 |
|
21
|
+
| runner | 1 |
|
22
|
+
|
23
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Mixes Strabo::Indexer into Hash so that the hashes built by later steps
# respond to #keywords.
Given 'hash includes indexer' do
  # Module#include is private on older Rubies, hence the explicit send.
  Hash.send(:include, Strabo::Indexer)
end
|
6
|
+
|
7
|
+
# Starts a fresh hash in @context and copies every key/value pair of the step
# table (as returned by rows_hash) into it.
Given /a hash with/ do |table|
  @context = {}
  table.rows_hash.each_pair { |attribute, text| @context.store(attribute, text) }
end
|
13
|
+
|
14
|
+
# Checks the flattened term-frequency map of @context (terms counted across
# all attributes) against a table of expected terms and frequencies.
#
# The feature files use either 'term'/'frequency' or the short 'T'/'F'
# column headers; the short ones are mapped to the long names first.
Then /it should have the following term frequencies:/ do |table|
  begin
    table.map_headers!('T' => 'term', 'F' => 'frequency')
  rescue
    # no big deal: the table may already use the long header names, so any
    # error raised while mapping is deliberately ignored.
  end

  # Build the index once; it does not change while the rows are checked.
  frequencies = @context.keywords(true)

  table.hashes.each do |row|
    frequencies[row['term']].should eql(row['frequency'].to_i)
  end
end
|
25
|
+
|
26
|
+
# Checks the term-frequency map of a single attribute of @context against a
# table of expected terms and frequencies.
#
# The feature files use either 'term'/'frequency' or the short 'T'/'F'
# column headers; the short ones are mapped to the long names first.
Then /it should have the following "(.+)" term frequencies:/ do |attribute, table|
  begin
    table.map_headers!('T' => 'term', 'F' => 'frequency')
  rescue
    # no big deal: the table may already use the long header names, so any
    # error raised while mapping is deliberately ignored.
  end

  # Build the index once; it does not change while the rows are checked.
  frequencies = @context.keywords[attribute]

  table.hashes.each do |row|
    # Check presence first so that a missing term is reported as such rather
    # than as a nil/number mismatch.
    frequencies.keys.should include(row['term'])
    frequencies[row['term']].should eql(row['frequency'].to_i)
  end
end
|
data/lib/strabo.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
# The standard library file is lowercase; requiring 'Set' raises LoadError on
# case-sensitive file systems.
require 'set'

# Strabo assists full text search indexing by generating term-frequency maps
# for an object's attributes. The term-frequency map may be flattened into
# an index for the entire object.
#
# Strabo was written with MongoDB in mind. The idea is that a document will
# store its own embedded keyword index that MongoDB can use for full text
# search.
#
# @example Using strabo
#   class Book < Hash
#     include Strabo::Indexer
#   end
#
#   book = Book.new
#   book['title'] = 'Learn to Program'
#   book['author'] = 'Chris Pine'
#   book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
#
# @author Jon Morton
#
module Strabo

  # Stemming configuration. By default, Strabo performs no stemming.
  #
  # @example Configuring stemming
  #   require 'rubygems'
  #   require 'lingua/stemmer'
  #   Strabo::Stemmer.stemmer = lambda { |term| Lingua.stemmer(term) }
  #
  # @see http://github.com/aurelian/ruby-stemmer Ruby-Stemmer on github
  #
  module Stemmer

    # Set the stemmer used during tokenization. The stemmer is stored on the
    # module itself, so it applies to every object that is indexed.
    #
    # @param [#call, nil] stemmer called with individual tokens; nil turns
    #   stemming off again
    #
    # @see Strabo::Stemmer.stem
    def self.stemmer=(stemmer)
      @stemmer = stemmer
    end

    # Invokes stemmer on token. If no stemmer has been configured, it will
    # return the original token.
    #
    # @param [String] token
    #
    # @return [String] result of stemming
    #
    # @see Strabo::Stemmer.stemmer=
    def self.stem(token)
      @stemmer ? @stemmer.call(token) : token
    end
  end

  # Mixin for objects whose #each yields attribute/value pairs (for example
  # a Hash).
  module Indexer

    # Get attribute-term-frequency map. If flattened, a term-frequency map
    # without the context of the attribute.
    #
    # The map is rebuilt on every call and kept in a local variable, so the
    # receiver is never modified and may be frozen.
    #
    # @param [Boolean] flatten sum the frequencies of each term over all
    #   attributes instead of grouping them by attribute
    #
    # @return [Hash] { attribute => { term => frequency } } or
    #   { term => frequency } map.
    def keywords(flatten = false)
      term_map = {}
      each { |key, value| term_map[key] = frequency(tokenize(value)) }
      flatten ? flatten_keyword_map(term_map) : term_map
    end

    private

    # Break a value into a list of lower-cased, stemmed terms.
    #
    # @param [#to_s] value text to convert into a list; non-String values
    #   (numbers, nil, ...) are converted with to_s first, so nil yields no
    #   terms
    # @param [Regexp] delimiter pattern matching a single token; despite its
    #   name it describes the tokens, not the separators between them
    #
    # @return [Array] list of stemmed terms
    #
    # @private
    def tokenize(value, delimiter = /\S+/)
      value.to_s.downcase.scan(delimiter).map { |token| Strabo::Stemmer.stem(token) }
    end

    # Tally the number of occurrences of a value in a list.
    #
    # @param [Array] values list of terms to count
    #
    # @return [Hash] term-frequency map; terms that do not occur have no
    #   entry
    #
    # @private
    def frequency(values)
      values.each_with_object({}) do |term, counts|
        counts[term] = counts.fetch(term, 0) + 1
      end
    end

    # Merge the per-attribute maps into a single term-frequency map by
    # adding up the frequencies of equal terms.
    #
    # @param [Hash] map { attribute => { term => frequency } }
    #
    # @return [Hash] { term => frequency }
    #
    # @see Strabo::Indexer#keywords
    #
    # @private
    def flatten_keyword_map(map)
      map.each_with_object({}) do |(_attribute, terms), totals|
        terms.each do |term, count|
          totals[term] = totals.fetch(term, 0) + count
        end
      end
    end

  end
end
|
data/readme.markdown
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Strabo
|
2
|
+
|
3
|
+
## About
|
4
|
+
|
5
|
+
Strabo prepares a Ruby object for full text search by tokenizing the object's attributes.
|
6
|
+
|
7
|
+
class Book < Hash
|
8
|
+
include Strabo::Indexer
|
9
|
+
end
|
10
|
+
|
11
|
+
book = Book.new
|
12
|
+
book['title'] = 'Learn to Program'
|
13
|
+
book['author'] = 'Chris Pine'
|
14
|
+
book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: strabo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jon Morton
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-03-28 00:00:00 -04:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Simplified tokenization, stemming, and term-frequency map indexes
|
17
|
+
email: "jon.morton@gmail.com"
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- .gitignore
|
26
|
+
- Rakefile
|
27
|
+
- VERSION
|
28
|
+
- examples/book.rb
|
29
|
+
- features/index.feature
|
30
|
+
- features/stemming.feature
|
31
|
+
- features/steps/index_steps.rb
|
32
|
+
- features/steps/stemmer_steps.rb
|
33
|
+
- features/support/env.rb
|
34
|
+
- lib/strabo.rb
|
35
|
+
- readme.markdown
|
36
|
+
has_rdoc: true
|
37
|
+
homepage: http://github.com/jmorton/strabo
|
38
|
+
licenses: []
|
39
|
+
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options:
|
42
|
+
- --charset=UTF-8
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
version:
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
version:
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 1.3.5
|
61
|
+
signing_key:
|
62
|
+
specification_version: 3
|
63
|
+
summary: Full text search utilities for Ruby
|
64
|
+
test_files:
|
65
|
+
- examples/book.rb
|