Hearch 2018.8.28 → 2018.8.31
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/Hearch.rb +116 -23
- metadata +22 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 027d6569ecfe8136979a5e251cb8b47613613515
|
4
|
+
data.tar.gz: f5c63fa7936c14655806a227bac9810f973b5eda
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7bccddf365d81d95fb52cef66f2ef98b50a276b1a35dd144423317d64167b441fa72f33611d032f86d982910b8a4ae3752fc74d04bf8d877ad87a73deeb510b1
|
7
|
+
data.tar.gz: 7faf5c16c64a253c8cb8fb8656fbb56933ede895b5a6f8928109591ee40883f90fa839cc7795ba2ebc1609212ad3f294f961bce3cad82ece552ca5ebe6040045
|
data/lib/Hearch.rb
CHANGED
@@ -1,40 +1,133 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
$LOAD_PATH.unshift File.expand_path('../Hearch', __FILE__)
|
4
|
+
|
5
|
+
# require 'Hearch/HearchIndexEntry_pb'
|
6
|
+
require 'Hearch/HearchIndex_pb'
|
7
|
+
require 'rmmseg'
|
8
|
+
|
9
|
+
include RMMSeg
|
5
10
|
|
6
11
|
class Hearch
|
7
12
|
def initialize
|
8
|
-
|
9
|
-
|
13
|
+
@index = nil #索引对象。
|
14
|
+
end
|
15
|
+
|
16
|
+
# Initialize an empty index.
|
17
|
+
#
|
18
|
+
# Example:
|
19
|
+
# >> hearch=Hearch.new
|
20
|
+
# >> hearch.initializeIndex
|
21
|
+
def initializeIndex
|
22
|
+
@index=Com::Stupidbeauty::Hearch::HearchIndex.new #索引库对象。
|
23
|
+
end
|
24
|
+
|
25
|
+
#分词获取关键字列表。
|
26
|
+
def getKeywordList(keywordsString)
|
27
|
+
resultList=[] #结果列表。
|
28
|
+
|
29
|
+
resultList=segment(keywordsString)
|
30
|
+
|
31
|
+
# resultList << keywordsString #最简单实现,直接将整个字符串加入。
|
32
|
+
|
33
|
+
resultList #返回结果。
|
34
|
+
end #def getKeywordList(keywordsString)
|
10
35
|
|
11
|
-
|
36
|
+
# Load index.
|
37
|
+
#
|
38
|
+
# Example:
|
39
|
+
# >> hearch=Hearch.new
|
40
|
+
# >> hearch.loadIndex('indexFile.pb')
|
41
|
+
# Arguments:
|
42
|
+
# indexFileName: (String)
|
43
|
+
def loadIndex(indexFileName)
|
44
|
+
fileContent=File.read(indexFileName) #读取文件内容。
|
45
|
+
|
46
|
+
#解析protobuf:
|
47
|
+
@index=Com::Stupidbeauty::Hearch::HearchIndex.decode(fileContent) #protobuf解码。
|
12
48
|
end
|
13
49
|
|
14
|
-
#
|
50
|
+
# Save index.
|
15
51
|
#
|
16
52
|
# Example:
|
17
|
-
# >>
|
18
|
-
# >>
|
53
|
+
# >> hearch=Hearch.new
|
54
|
+
# >> hearch.saveIndex('indexFile.pb')
|
55
|
+
# Arguments:
|
56
|
+
# indexFileName: (String)
|
57
|
+
def saveIndex(indexFileName)
|
58
|
+
manufacturerListEncoded="" #初始化要存储的缓冲区。
|
59
|
+
manufacturerListEncoded=Com::Stupidbeauty::Hearch::HearchIndex.encode(@index) #重新编码。
|
60
|
+
|
61
|
+
manufacturerListFile=File.open(indexFileName, 'w') #打开本地记录文件。
|
62
|
+
manufacturerListFile.write(manufacturerListEncoded) #写入到文件中。
|
63
|
+
manufacturerListFile.close #关闭文件。
|
64
|
+
end
|
65
|
+
|
66
|
+
# Search for articles in the index.
|
19
67
|
#
|
68
|
+
# Example:
|
69
|
+
# >> hearch=Hearch.new
|
70
|
+
# >> hearch.loadIndex('indexFile.pb')
|
71
|
+
# >> keywordsString='something to search'
|
72
|
+
# >> resultList=hearch.search(keywordsString)
|
20
73
|
# Arguments:
|
21
|
-
#
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
74
|
+
# keywordsString: (String)
|
75
|
+
def search(keywordsString)
|
76
|
+
resultList=[] #结果列表。
|
77
|
+
keywordList=getKeywordList(keywordsString) #分词获取关键字列表。
|
78
|
+
|
79
|
+
@index.entry.each do |currentEntry| #一个个条目地比较。
|
80
|
+
keywordList.each do |currentKeyword| #一个个关键字地比较。
|
81
|
+
# puts "Current keyword: ", currentKeyword, ", current entry keyword: ", currentEntry.keyword #Debug.
|
82
|
+
if currentEntry.keyword==currentKeyword #关键字相同。
|
83
|
+
resultList << currentEntry.articleId #将文章编号加入到结果列表中。
|
84
|
+
|
85
|
+
break #跳出。
|
86
|
+
end #if currentEntry.keyword==currentKeyword #关键字相同。
|
87
|
+
end #keywordList.each do |currentKeyword| #一个个关键字地比较。
|
88
|
+
end #@index.entry.each do |currentEntry| #一个个条目地比较。
|
89
|
+
|
90
|
+
resultList #返回结果列表。
|
91
|
+
end
|
92
|
+
|
93
|
+
# Add an article into the index.
|
94
|
+
#
|
95
|
+
# Example:
|
96
|
+
# >> hearch=Hearch.new
|
97
|
+
# >> hearch.loadIndex('indexFile.pb')
|
98
|
+
# >> articleContentString='something to search'
|
99
|
+
# >> hearch.addArticle(articleContentString, 1)
|
100
|
+
# Arguments:
|
101
|
+
# articleContentString: (String)
|
102
|
+
# articleId: (int)
|
103
|
+
def addArticle(articleContentString, articleId)
|
104
|
+
keywordList=getKeywordList(articleContentString) #分词获取关键字列表。
|
30
105
|
|
31
|
-
|
32
|
-
|
33
|
-
|
106
|
+
#遍历关键字列表:
|
107
|
+
keywordList.each do |currentKeyword| #一个个关键字地添加。
|
108
|
+
|
109
|
+
existEntry=false #是否命中了已有条目。
|
34
110
|
|
35
|
-
|
111
|
+
@index.entry.each do |currentEntry| #一个个条目地比较。
|
112
|
+
if currentEntry.keyword==currentKeyword #关键字相同。
|
113
|
+
currentEntry.articleId << articleId #将文章编号加入到文章编号列表中。
|
114
|
+
|
115
|
+
existEntry=true #是命中了已有条目。
|
116
|
+
|
117
|
+
break #跳出。
|
118
|
+
end #if currentEntry.keyword==currentKeyword #关键字相同。
|
36
119
|
|
37
|
-
|
38
|
-
|
120
|
+
end #@index.entry.each do |currentEntry| #一个个条目地比较。
|
121
|
+
|
122
|
+
if (existEntry) #命中了已有条目。
|
123
|
+
else #未命中已有条目。
|
124
|
+
currentEntry=Com::Stupidbeauty::Hearch::HearchIndexEntry.new #索引库条目对象。
|
125
|
+
|
126
|
+
currentEntry.keyword=currentKeyword #设置关键字。
|
127
|
+
currentEntry.articleId << articleId #将文章编号加入到文章编号列表中。
|
128
|
+
|
129
|
+
@index.entry << currentEntry #加入条目列表中。
|
130
|
+
end #if (existEntry) #命中了已有条目。
|
131
|
+
end #keywordList.each do |currentKeyword| #一个个关键字地比较。
|
39
132
|
end
|
40
133
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: Hearch
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2018.8.28
|
4
|
+
version: 2018.8.31
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hxcan Cai
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-08-28 00:00:00.000000000 Z
|
11
|
+
date: 2018-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: google-protobuf
|
@@ -44,6 +44,26 @@ dependencies:
|
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '1.0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: plexus-rmmseg
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0.1'
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: 0.1.6
|
57
|
+
type: :runtime
|
58
|
+
prerelease: false
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - "~>"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0.1'
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 0.1.6
|
47
67
|
description: Hearch ruby. Index and search.
|
48
68
|
email: caihuosheng@gmail.com
|
49
69
|
executables: []
|