strabo 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +10 -0
- data/Rakefile +36 -0
- data/VERSION +1 -0
- data/examples/book.rb +10 -0
- data/features/index.feature +64 -0
- data/features/stemming.feature +23 -0
- data/features/steps/index_steps.rb +37 -0
- data/features/steps/stemmer_steps.rb +6 -0
- data/features/support/env.rb +5 -0
- data/lib/strabo.rb +115 -0
- data/readme.markdown +14 -0
- metadata +65 -0
data/.gitignore
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging and release tasks. Each optional tool is loaded inside its own
# begin/rescue so a missing gem only disables its tasks and prints a hint.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "strabo"
    gemspec.summary = "Full text search utilities for Ruby"
    gemspec.description = "Simplified tokenization, stemming, and term-frequency map indexes"
    # No trailing whitespace: this value is copied verbatim into the gem metadata.
    gemspec.email = "jon.morton@gmail.com"
    gemspec.homepage = "http://github.com/jmorton/strabo"
    gemspec.authors = ["Jon Morton"]
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end

# API documentation task.
begin
  # The library file is lowercase. `require 'YARD'` only resolves on
  # case-insensitive filesystems; elsewhere it raises LoadError and the
  # rescue below would hide the fact that yard is actually installed.
  require 'yard'
  YARD::Rake::YardocTask.new do |t|
    t.files = ['lib/**/*.rb']
  end
rescue LoadError
  puts "Yard not available. Install it with: gem install yard"
end

# Acceptance tests: `rake features` runs the Cucumber features directory.
begin
  require 'cucumber'
  require 'cucumber/rake/task'
  Cucumber::Rake::Task.new(:features) do |t|
    t.cucumber_opts = "features --format pretty"
  end
rescue LoadError
  puts "Cucumber not available. Install it with: gem install cucumber"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
data/examples/book.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
Feature: Index a document
|
2
|
+
In order to make text searchable
|
3
|
+
A document's contents should be indexed
|
4
|
+
|
5
|
+
Background:
|
6
|
+
Given hash includes indexer
|
7
|
+
|
8
|
+
Scenario: Indexing
|
9
|
+
Given a hash with
|
10
|
+
| title | Strabo's Geographica |
|
11
|
+
| author | Strabo of Amaseia |
|
12
|
+
| summary | Strabo was born to an affluent family from Amaseia in Pontus |
|
13
|
+
Then it should have the following "title" term frequencies:
|
14
|
+
| term | frequency |
|
15
|
+
| geographica | 1 |
|
16
|
+
Then it should have the following "author" term frequencies:
|
17
|
+
| term | frequency |
|
18
|
+
| strabo | 1 |
|
19
|
+
| of | 1 |
|
20
|
+
| amaseia | 1 |
|
21
|
+
Then it should have the following "summary" term frequencies:
|
22
|
+
| term | frequency |
|
23
|
+
| strabo | 1 |
|
24
|
+
| was | 1 |
|
25
|
+
| born | 1 |
|
26
|
+
| to | 1 |
|
27
|
+
| an | 1 |
|
28
|
+
| affluent | 1 |
|
29
|
+
Then it should have the following term frequencies:
|
30
|
+
| term | frequency |
|
31
|
+
| strabo | 2 |
|
32
|
+
| amaseia | 2 |
|
33
|
+
| geographica | 1 |
|
34
|
+
|
35
|
+
Scenario: Abstract indexing
|
36
|
+
Given a hash with
|
37
|
+
| foo | x y x y x y |
|
38
|
+
| bar | x y z x y z |
|
39
|
+
| baz | a b c x y z |
|
40
|
+
Then it should have the following "foo" term frequencies:
|
41
|
+
| T | F |
|
42
|
+
| x | 3 |
|
43
|
+
| y | 3 |
|
44
|
+
Then it should have the following "bar" term frequencies:
|
45
|
+
| T | F |
|
46
|
+
| x | 2 |
|
47
|
+
| y | 2 |
|
48
|
+
| z | 2 |
|
49
|
+
Then it should have the following "baz" term frequencies:
|
50
|
+
| T | F |
|
51
|
+
| x | 1 |
|
52
|
+
| y | 1 |
|
53
|
+
| z | 1 |
|
54
|
+
| a | 1 |
|
55
|
+
| b | 1 |
|
56
|
+
| c | 1 |
|
57
|
+
Then it should have the following term frequencies:
|
58
|
+
| T | F |
|
59
|
+
| x | 6 |
|
60
|
+
| y | 6 |
|
61
|
+
| z | 3 |
|
62
|
+
| a | 1 |
|
63
|
+
| b | 1 |
|
64
|
+
| c | 1 |
|
@@ -0,0 +1,23 @@
|
|
1
|
+
Feature: Stem tokens
|
2
|
+
In order to increase recall
|
3
|
+
A document's tokens may be stemmed
|
4
|
+
|
5
|
+
Background:
|
6
|
+
Given hash includes indexer
|
7
|
+
And stemming uses the "ruby-stemmer" gem
|
8
|
+
|
9
|
+
Scenario: Stemming tokens
|
10
|
+
Given a hash with
|
11
|
+
| foo | jump jumps jumping jumper |
|
12
|
+
| bar | ran run running runner |
|
13
|
+
Then it should have the following "foo" term frequencies:
|
14
|
+
| T | F |
|
15
|
+
| jump | 3 |
|
16
|
+
| jumper | 1 |
|
17
|
+
Then it should have the following "bar" term frequencies:
|
18
|
+
| T | F |
|
19
|
+
| run | 2 |
|
20
|
+
| ran | 1 |
|
21
|
+
| runner | 1 |
|
22
|
+
|
23
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Mixes Strabo::Indexer into Hash so that plain hashes respond to #keywords.
Given 'hash includes indexer' do
  class Hash
    include Strabo::Indexer
  end
end

# Builds the document under test from the step's table: each row's first
# cell becomes a key of @context and its second cell the text to index.
Given /a hash with/ do |table|
  @context = {}
  table.rows_hash.each do |attribute, text|
    @context[attribute] = text
  end
end

# Checks the flattened (whole-document) term-frequency map.
Then /it should have the following term frequencies:/ do |table|
  begin
    # Accept the short 'T'/'F' column headers as aliases for the long names.
    table.map_headers!('T' => 'term', 'F' => 'frequency')
  rescue
    # no big deal
    # NOTE(review): presumably raised when the table already uses the long
    # header names -- confirm against the Cucumber version in use.
  end

  # Build the index once; it does not change while the rows are checked.
  flattened = @context.keywords(true)

  table.hashes.each do |row|
    flattened[row['term']].should eql(row['frequency'].to_i)
  end
end

# Checks the term-frequency map of a single attribute.
Then /it should have the following "(.+)" term frequencies:/ do |attribute, table|
  begin
    # Accept the short 'T'/'F' column headers as aliases for the long names.
    table.map_headers!('T' => 'term', 'F' => 'frequency')
  rescue
    # no big deal
    # NOTE(review): presumably raised when the table already uses the long
    # header names -- confirm against the Cucumber version in use.
  end

  # Build the index once; it does not change while the rows are checked.
  frequencies = @context.keywords[attribute]

  table.hashes.each do |row|
    frequencies.keys.should include(row['term'])
    frequencies[row['term']].should eql(row['frequency'].to_i)
  end
end
|
data/lib/strabo.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
# Strabo assists full text search indexing by generating term-frequency maps
# for an object's attributes. The term-frequency map may be flattened into
# an index for the entire object.
#
# Strabo was written with MongoDB in mind. The idea is that a document will
# store its own embedded keyword index that MongoDB can use for full text
# search.
#
# @example Using strabo
#   class Book < Hash
#     include Strabo::Indexer
#   end
#
#   book = Book.new
#   book['title'] = 'Learn to Program'
#   book['author'] = 'Chris Pine'
#   book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
#
# @author Jon Morton
#
module Strabo

  # Stemming configuration. By default, Strabo performs no stemming.
  #
  # @example Configuring stemming
  #   require 'rubygems'
  #   require 'lingua/stemmer'
  #   Strabo::Stemmer.stemmer = lambda { |term| Lingua.stemmer(term) }
  #
  # @see http://github.com/aurelian/ruby-stemmer Ruby-Stemmer on github
  #
  module Stemmer

    # Set the stemmer used during tokenization. Assigning nil turns
    # stemming off again.
    #
    # @param [#call, nil] stemmer called with individual tokens
    #
    # @see Strabo::Stemmer.stem
    def self.stemmer=(stemmer)
      @stemmer = stemmer
    end

    # Invokes stemmer on token. If no stemmer has been configured, it will
    # return the original token.
    #
    # @param [String] token
    #
    # @return [String] result of stemming
    #
    # @see Strabo::Stemmer.stemmer=
    def self.stem(token)
      return token if @stemmer.nil?

      @stemmer.call(token)
    end
  end

  # Mixin for hash-like objects: the including object must provide an #each
  # that yields (attribute, value) pairs.
  module Indexer

    # Get attribute-term-frequency map. If flattened, a term-frequency map
    # without the context of the attribute.
    #
    # The map is rebuilt on every call and kept in a local variable, so the
    # receiver gains no instance state and may be frozen.
    #
    # @param [TrueClass, FalseClass] flatten
    #
    # @return [Hash] { attribute => { term => frequency } } or
    #   { term => frequency } map.
    def keywords(flatten = false)
      term_map = {}
      each { |key, value| term_map[key] = frequency(tokenize(value)) }
      flatten ? flatten_keyword_map(term_map) : term_map
    end

    private

    # Break a value into a list of lower-cased, stemmed terms.
    #
    # Non-String values are coerced instead of raising: anything that
    # responds to #join (e.g. an Array of tags) is joined with spaces,
    # everything else goes through #to_s, so nil yields no terms.
    #
    # @param [String, Object] value text to convert into a list
    # @param [Regexp] delimiter pattern handed to String#scan; despite its
    #   name it matches the tokens themselves, not the separators
    #
    # @return [Array] list of stemmed terms
    #
    # @private
    def tokenize(value, delimiter = /\S+/)
      text = value.respond_to?(:join) ? value.join(' ') : value.to_s
      text.downcase.scan(delimiter).map { |token| Strabo::Stemmer.stem(token) }
    end

    # Tally the number of occurrences of a value in a list.
    #
    # @param [Array] values list of terms to count
    #
    # @return [Hash] term-frequency map
    #
    # @private
    def frequency(values)
      # A plain hash (no default value) is returned so that looking up a
      # term that never occurred still yields nil.
      values.each_with_object({}) do |term, counts|
        counts[term] = (counts[term] || 0) + 1
      end
    end

    # Merge the per-attribute maps into one term-frequency map by summing
    # the counts of terms that occur under several attributes.
    #
    # @param [Hash] map { attribute => { term => frequency } }
    #
    # @return [Hash] { term => frequency }
    #
    # @see Strabo::Indexer#keywords
    #
    # @private
    def flatten_keyword_map(map)
      totals = {}
      map.each_value do |terms|
        terms.each { |term, count| totals[term] = (totals[term] || 0) + count }
      end
      totals
    end

  end
end
|
data/readme.markdown
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Strabo
|
2
|
+
|
3
|
+
## About
|
4
|
+
|
5
|
+
Strabo prepares a Ruby object for full text search by tokenizing the object's attributes.
|
6
|
+
|
7
|
+
class Book < Hash
|
8
|
+
include Strabo::Indexer
|
9
|
+
end
|
10
|
+
|
11
|
+
book = Book.new
|
12
|
+
book['title'] = 'Learn to Program'
|
13
|
+
book['author'] = 'Chris Pine'
|
14
|
+
book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: strabo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jon Morton
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-03-28 00:00:00 -04:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Simplified tokenization, stemming, and term-frequency map indexes
|
17
|
+
email: "jon.morton@gmail.com "
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- .gitignore
|
26
|
+
- Rakefile
|
27
|
+
- VERSION
|
28
|
+
- examples/book.rb
|
29
|
+
- features/index.feature
|
30
|
+
- features/stemming.feature
|
31
|
+
- features/steps/index_steps.rb
|
32
|
+
- features/steps/stemmer_steps.rb
|
33
|
+
- features/support/env.rb
|
34
|
+
- lib/strabo.rb
|
35
|
+
- readme.markdown
|
36
|
+
has_rdoc: true
|
37
|
+
homepage: http://github.com/jmorton/strabo
|
38
|
+
licenses: []
|
39
|
+
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options:
|
42
|
+
- --charset=UTF-8
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
version:
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
version:
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 1.3.5
|
61
|
+
signing_key:
|
62
|
+
specification_version: 3
|
63
|
+
summary: Full text search utilities for Ruby
|
64
|
+
test_files:
|
65
|
+
- examples/book.rb
|