cassiopee 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/Changelog CHANGED
@@ -1,3 +1,5 @@
1
+ v0.1.3 : 09/11 Olivier Sallou
2
+ add CrawlerMt in cassiopee-mt for multi-thread support to speed up the search
1
3
  v0.1.2 : 09/11 Olivier Sallou
2
4
  add possibility to reload an "index" instead of using index method again
3
5
  fix comment mngt (comments attribute)
data/demo-mt.rb ADDED
@@ -0,0 +1,28 @@
1
+ require File.join(File.dirname(__FILE__), 'lib/cassiopee-mt')
2
+ require 'rubygems'
3
+ require 'logger'
4
+
5
+ # Instantiate a new crawler
6
+ crawler = CassiopeeMt::CrawlerMt.new
7
+ crawler.setLogLevel(Logger::INFO)
8
+ crawler.maxthread=3
9
+ #crawler.use_store = true
10
+
11
+ # String to index
12
+ crawler.indexString('iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiisallou salluiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii')
13
+ # Search pattern in indexed string
14
+ crawler.searchExact('llo')
15
+
16
+ # Go through matches
17
+ while((match = crawler.next())!=nil)
18
+ puts "got an exact match " << match.inspect
19
+ end
20
+
21
+ crawler.clear()
22
+
23
+ crawler.searchApproximate('llo',1)
24
+
25
+ # Go through matches
26
+ while((match = crawler.next())!=nil)
27
+ puts "got an approximate match " << match.inspect
28
+ end
@@ -0,0 +1,79 @@
1
+ require 'digest/md5'
2
+ require 'logger'
3
+ require 'zlib'
4
+ #require 'rubygems'
5
+ #require 'text'
6
+ #require 'text/util'
7
+ require File.join(File.dirname(__FILE__), 'cassiopee')
8
+
9
+ include Cassiopee
10
+
11
+ # Module managing multi threads to search in strings, extending Cassiopee
12
+ module CassiopeeMt
13
+
14
+ # Multi threaded search using a Crawler per thread
15
+ # Filtering is used to split the input data according to maxthread
16
+ # Matches of each thread are merged into the matches of CrawlerMt
17
+ class CrawlerMt < Crawler
18
+
19
+ # Max number of threads to use
20
+ attr_accessor :maxthread
21
+
22
+ @th = []
23
+
24
+ def initialize
25
+ super
26
+ @th = []
27
+ @matches = Array.new
28
+ end
29
+
30
+
31
+ def setParams(crawler,threadId)
32
+ crawler.setLogLevel($log.level)
33
+ crawler.file_suffix = @file_suffix
34
+ crawler.loadIndex()
35
+ #crawler.file_suffix = @file_suffix+"."+threadId.to_s
36
+ crawler.indexString(@sequence)
37
+ end
38
+
39
+ def searchExact(pattern)
40
+ nb = @sequence.length.div(maxthread)
41
+ min = 0
42
+ (1..maxthread).each do |i|
43
+ crawler = Crawler.new
44
+ setParams(crawler,i)
45
+ max = min + nb
46
+ if(i==maxthread)
47
+ max = @sequence.length
48
+ end
49
+ crawler.filter_position(min,max)
50
+ $log.debug("Start new Thread between " << min.to_s << " and " << max.to_s)
51
+ @th[i-1] = Thread.new{ Thread.current["matches"] = crawler.searchExact(pattern) }
52
+ min = max + 1
53
+ end
54
+ @th.each {|t| t.join; t["matches"].each { |m| @matches << m }}
55
+ return @matches
56
+ end
57
+
58
+ def searchApproximate(s,edit)
59
+ nb = @sequence.length.div(maxthread)
60
+ min = 0
61
+ (1..maxthread).each do |i|
62
+ crawler = Crawler.new
63
+ setParams(crawler,i)
64
+ max = min + nb
65
+ if(i==maxthread)
66
+ max = @sequence.length
67
+ end
68
+ crawler.filter_position(min,max)
69
+ $log.debug("Start new Thread between " << min.to_s << " and " << max.to_s)
70
+ @th[i-1] = Thread.new{ Thread.current["matches"] = crawler.searchApproximate(s,edit) }
71
+ min = max + 1
72
+ end
73
+ @th.each {|t| t.join; t["matches"].each { |m| @matches << m }}
74
+ return @matches
75
+ end
76
+
77
+ end
78
+
79
+ end
data/lib/cassiopee.rb CHANGED
@@ -136,15 +136,14 @@ module Cassiopee
136
136
  # * ambiguous is a Hash of char/Array of char mapping
137
137
 
138
138
  def isAmbiguousEqual(a,b,ambiguous)
139
- if(ambiguous==nil || ambiguous[a.chr]==nil)
139
+ if(ambiguous==nil || (ambiguous[a.chr]==nil && ambiguous[b.chr]==nil ))
140
140
  if(a==b)
141
141
  return true
142
142
  else
143
143
  return false
144
144
  end
145
145
  end
146
- vin = "" << a.chr
147
- if(ambiguous[a.chr].index(b.chr)!=nil)
146
+ if(ambiguous[a.chr].index(b.chr)!=nil || ambiguous[b.chr].index(a.chr)!=nil || a==b)
148
147
  return true
149
148
  else
150
149
  return false
@@ -214,6 +213,7 @@ module Cassiopee
214
213
 
215
214
  def clear
216
215
  @suffixes = Hash.new
216
+ @matches = Array.new
217
217
  File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS)
218
218
  end
219
219
 
data/tests/test-suite.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require File.join(File.dirname(__FILE__), '../lib/cassiopee')
2
+ require File.join(File.dirname(__FILE__), '../lib/cassiopee-mt')
2
3
  require 'rubygems'
3
4
  require 'logger'
4
5
  require 'test/unit'
@@ -40,6 +41,14 @@ class TestCrawler < Test::Unit::TestCase
40
41
  assert_equal(1,matches.length)
41
42
  end
42
43
 
44
+ def test_multithreadsearch
45
+ crawler = CassiopeeMt::CrawlerMt.new
46
+ crawler.maxthread=3
47
+ crawler.indexString('iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiimy sample exampleiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii')
48
+ matches = crawler.searchExact('exam')
49
+ assert_equal(1,matches.length)
50
+ end
51
+
43
52
  end
44
53
 
45
54
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cassiopee
3
3
  version: !ruby/object:Gem::Version
4
- hash: 31
4
+ hash: 29
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 2
10
- version: 0.1.2
9
+ - 3
10
+ version: 0.1.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Olivier Sallou
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-09-09 00:00:00 +02:00
18
+ date: 2011-09-19 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -47,7 +47,9 @@ files:
47
47
  - Changelog
48
48
  - LICENSE
49
49
  - demo.rb
50
+ - demo-mt.rb
50
51
  - lib/cassiopee.rb
52
+ - lib/cassiopee-mt.rb
51
53
  - bin/cassie.rb
52
54
  - tests/test-suite.rb
53
55
  - tests/amb.map