harmonious_dictionary 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/CHANGELOG +3 -0
- data/MIT-LICENSE +20 -0
- data/README.markdown +53 -0
- data/Rakefile +8 -0
- data/benchmark/benchmark.rb +43 -0
- data/benchmark/text_test_100.txt +4 -0
- data/benchmark/text_test_1000.txt +25 -0
- data/benchmark/text_test_10000.txt +219 -0
- data/bin/harmonious_rseg +11 -0
- data/bin/harmonious_server +63 -0
- data/harmonious_dictionary.gemspec +20 -0
- data/lib/generators/harmonious_dictionary/setup/setup_generator.rb +16 -0
- data/lib/generators/harmonious_dictionary/setup/templates/chinese_dictionary.txt +0 -0
- data/lib/generators/harmonious_dictionary/setup/templates/english_dictionary.txt +0 -0
- data/lib/generators/harmonious_dictionary/setup/templates/remote_server.yml +8 -0
- data/lib/harmonious_dictionary.rb +48 -0
- data/lib/harmonious_dictionary/app.rb +18 -0
- data/lib/harmonious_dictionary/engines/dict.rb +51 -0
- data/lib/harmonious_dictionary/engines/engine.rb +21 -0
- data/lib/harmonious_dictionary/engines/english.rb +27 -0
- data/lib/harmonious_dictionary/filters/conjunction.rb +11 -0
- data/lib/harmonious_dictionary/filters/fullwidth.rb +21 -0
- data/lib/harmonious_dictionary/filters/symbol.rb +16 -0
- data/lib/harmonious_dictionary/model_additions.rb +15 -0
- data/lib/harmonious_dictionary/railtie.rb +23 -0
- data/lib/harmonious_dictionary/rseg.rb +170 -0
- data/lib/harmonious_dictionary/version.rb +3 -0
- data/lib/tasks/generate_dictionary.rake +55 -0
- data/spec/harmonious_dictionary_spec.rb +40 -0
- data/spec/model_additions_spec.rb +57 -0
- data/spec/spec_helper.rb +21 -0
- metadata +99 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
# Rake task that compiles the plain-text word lists under
# config/harmonious_dictionary into the serialized dictionary files.
namespace :harmonious_dictionary do
  desc "generate harmonious dictionary for use"
  task :generate => :environment do
    dictionary_dir = File.join(Rails.root, 'config', 'harmonious_dictionary')
    chinese_source = File.join(dictionary_dir, 'chinese_dictionary.txt')
    english_source = File.join(dictionary_dir, 'english_dictionary.txt')

    # Chinese words are loaded into a nested-Hash tree and Marshal-dumped to hash_path.
    puts "Processing chinese words..."
    chinese_tree = {}
    process(chinese_source, chinese_tree)
    File.open(hash_path, "wb") { |out| Marshal.dump(chinese_tree, out) }
    puts 'Done'

    # English words are collected into an Array and written as YAML to yaml_path.
    puts 'Processing english words...'
    english_words = []
    process_english_words(english_source, english_words)
    File.open(yaml_path, "wb") { |out| YAML::dump(english_words, out) }
    puts 'Done'
  end
end
|
22
|
+
|
23
|
+
# Reads the word list at +path+ and appends one entry per line to +list+.
#
# path - String path of a text file containing one word per line.
# list - Array that receives the words (mutated in place).
#
# The trailing line terminator ("\n" or "\r\n") is stripped from every entry.
# String#chomp is used instead of String#gsub! because gsub! returns nil when
# nothing was replaced, which used to append nil for a final line that has no
# trailing newline.
def process_english_words(path, list)
  File.open(path, 'r') do |file|
    file.each_line { |line| list << line.chomp }
  end
end
|
28
|
+
|
29
|
+
# Inserts every word of the file at +path+ into +tree+, a character tree made
# of nested Hashes.
#
# path - String path of a text file containing one word per line.
# tree - Hash used as the root of the tree (mutated in place). Each character
#        of a word becomes a key whose value is the Hash for the next
#        character; the Hash reached by the last character gets :end => true.
#
# "\r" and "\n" characters are ignored. Lines that contain nothing else are
# skipped: previously such a line left no node to mark and raised
# NoMethodError on nil.
def process(path, tree)
  File.open(path, 'r') do |file|
    file.each_line do |line|
      word = line.delete("\r\n")
      next if word.empty?

      # Walk (creating as needed) one level per character, starting at the root.
      last_node = word.each_char.inject(tree) { |node, char| node[char] ||= {} }
      last_node[:end] = true
    end
  end
end
|
47
|
+
|
48
|
+
# Path of the file that receives the Marshal dump of the Chinese word tree:
# <Rails.root>/config/harmonious_dictionary/harmonious.hash
def hash_path
  segments = %w[config harmonious_dictionary harmonious.hash]
  File.join(Rails.root, *segments)
end
|
51
|
+
|
52
|
+
# Path of the file that receives the YAML dump of the English word list:
# <Rails.root>/config/harmonious_dictionary/harmonious_english.yml
def yaml_path
  segments = %w[config harmonious_dictionary harmonious_english.yml]
  File.join(Rails.root, *segments)
end
|
55
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Specs for the module-level HarmoniousDictionary API exercised here:
# harmonious_words, clean?, clean and clean_word_basic.
describe HarmoniousDictionary do

  describe 'local' do
    describe 'segment' do
      it 'should return harmonious word for sentence' do
        HarmoniousDictionary.harmonious_words('戴秉国在中国').should == ['戴秉国']
      end

      # Mixed input: Chinese text, English words and a URL in one sentence.
      it 'should return english,url and chinese words' do
        HarmoniousDictionary.harmonious_words('戴秉国 in china,watch cctv.com.let fuck it').should == ['戴秉国','fuck']
      end
    end

    it 'should find harmonious chinese words' do
      HarmoniousDictionary.clean?('李源潮在中国').should == false
    end

    it 'should pass good words' do
      HarmoniousDictionary.clean?('过去').should == true
    end

    it 'should clean sentence by replace harmonious words by *' do
      HarmoniousDictionary.clean('戴秉国在中国').should == '***在中国'
    end

    it 'should replace harmonious with *' do
      HarmoniousDictionary.clean_word_basic('大米').should == '**'
    end
  end

  # Remote-server example, currently disabled.
  # describe 'use remote' do
  #   it 'should use remote server for segment' do
  #     HarmoniousDictionary.clean_by_remote('戴秉国在中国').should == '***在中国'
  #   end
  # end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Run the specs against an in-memory SQLite database.
ActiveRecord::Base.establish_connection(adapter: "sqlite3", database: ":memory:")

# Table backing the Post model defined below: a string title plus two text
# columns, body and note.
ActiveRecord::Schema.define(version: 1) do
  create_table(:posts) do |table|
    table.string(:title)
    table.text(:body)
    table.text(:note)
  end
end
|
13
|
+
|
14
|
+
# ActiveRecord model used as the spec fixture. It extends
# HarmoniousDictionary::ModelAdditions and declares the harmonious validation
# on title and body only; note is left out of the declaration.
class Post < ActiveRecord::Base
  extend HarmoniousDictionary::ModelAdditions

  validate_harmonious_of(:title, :body)
end
|
18
|
+
|
19
|
+
# Specs for the validation that Post declares through
# HarmoniousDictionary::ModelAdditions (validate_harmonious_of :title, :body).
describe HarmoniousDictionary::ModelAdditions do
  # Record whose title, body and note all contain a sensitive word.
  let(:post) { Post.create title:'戴秉国在中国',body:'戴秉国在中国',note:'戴秉国在中国' }

  describe 'use local' do
    it 'should validate for harmonious' do
      post.errors[:title].should == ['不能含有敏感词']
    end

    it 'should have error on body' do
      post.errors[:body].should == ['不能含有敏感词']
    end

    it 'should allow empty input value' do
      # The title is deliberately omitted. Assert on the record created here
      # (not on the shared `post`) so the example really exercises a missing
      # attribute; the body is still expected to be flagged.
      untitled = Post.create body:'戴秉国在中国',note:'戴秉国在中国'
      untitled.errors[:body].should == ['不能含有敏感词']
    end

    it 'should filter! any harmonious words' do
      HarmoniousDictionary.clean(post.body).should == '***在中国'
    end
  end

  # To be enabled later.
  # describe 'use remote' do
  #   before(:each) do
  #     configuration = double("configuration")
  #     @double_harmonious_dictionary = double('harmonious_dictionary')
  #     configuration.stub(:harmonious_dictionary){ @double_harmonious_dictionary }
  #     @double_harmonious_dictionary.stub(:use_remote_server){ true }
  #     Rails.stub(:configuration){configuration}
  #   end

  #   it 'should validate for harmonious' do
  #     HarmoniousDictionary.should_receive(:clean_by_remote?)
  #     post.errors[:title].should == ['不能含有敏感词']
  #   end
  # end
end
|
57
|
+
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require 'rails'
|
4
|
+
require 'active_model'
|
5
|
+
require 'active_record'
|
6
|
+
require 'harmonious_dictionary'
|
7
|
+
|
8
|
+
# RSpec output settings: documentation formatter with colored output.
RSpec.configure do |rspec|
  rspec.formatter = 'documentation'
  rspec.color_enabled = true
end
|
12
|
+
|
13
|
+
# Overrides Rails.root and Rails.env for the spec run.
module Rails
  class << self
    # Directory of this file joined with '../' (the result keeps the
    # trailing "../" segment; it is not normalized).
    def root
      File.join(File.dirname(__FILE__), '../')
    end

    # The specs always run in the 'test' environment.
    def env
      'test'
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: harmonious_dictionary
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Stephen Kong
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-12-03 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: 和谐宝典用于检查输入是否包含中文或英文敏感词,并可替换为特殊字符。速度比常规的正则匹配要快10倍以上。生活在天朝,和谐宝典必须人手必备。
|
31
|
+
email:
|
32
|
+
- wear63659220@gmail.com
|
33
|
+
executables:
|
34
|
+
- harmonious_rseg
|
35
|
+
- harmonious_server
|
36
|
+
extensions: []
|
37
|
+
extra_rdoc_files: []
|
38
|
+
files:
|
39
|
+
- .gitignore
|
40
|
+
- .rspec
|
41
|
+
- CHANGELOG
|
42
|
+
- MIT-LICENSE
|
43
|
+
- README.markdown
|
44
|
+
- Rakefile
|
45
|
+
- benchmark/benchmark.rb
|
46
|
+
- benchmark/text_test_100.txt
|
47
|
+
- benchmark/text_test_1000.txt
|
48
|
+
- benchmark/text_test_10000.txt
|
49
|
+
- bin/harmonious_rseg
|
50
|
+
- bin/harmonious_server
|
51
|
+
- harmonious_dictionary.gemspec
|
52
|
+
- lib/generators/harmonious_dictionary/setup/setup_generator.rb
|
53
|
+
- lib/generators/harmonious_dictionary/setup/templates/chinese_dictionary.txt
|
54
|
+
- lib/generators/harmonious_dictionary/setup/templates/english_dictionary.txt
|
55
|
+
- lib/generators/harmonious_dictionary/setup/templates/remote_server.yml
|
56
|
+
- lib/harmonious_dictionary.rb
|
57
|
+
- lib/harmonious_dictionary/app.rb
|
58
|
+
- lib/harmonious_dictionary/engines/dict.rb
|
59
|
+
- lib/harmonious_dictionary/engines/engine.rb
|
60
|
+
- lib/harmonious_dictionary/engines/english.rb
|
61
|
+
- lib/harmonious_dictionary/filters/conjunction.rb
|
62
|
+
- lib/harmonious_dictionary/filters/fullwidth.rb
|
63
|
+
- lib/harmonious_dictionary/filters/symbol.rb
|
64
|
+
- lib/harmonious_dictionary/model_additions.rb
|
65
|
+
- lib/harmonious_dictionary/railtie.rb
|
66
|
+
- lib/harmonious_dictionary/rseg.rb
|
67
|
+
- lib/harmonious_dictionary/version.rb
|
68
|
+
- lib/tasks/generate_dictionary.rake
|
69
|
+
- spec/harmonious_dictionary_spec.rb
|
70
|
+
- spec/model_additions_spec.rb
|
71
|
+
- spec/spec_helper.rb
|
72
|
+
homepage: https://github.com/wear/harmonious_dictionary
|
73
|
+
licenses: []
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
none: false
|
80
|
+
requirements:
|
81
|
+
- - ! '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ! '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
requirements: []
|
91
|
+
rubyforge_project:
|
92
|
+
rubygems_version: 1.8.24
|
93
|
+
signing_key:
|
94
|
+
specification_version: 3
|
95
|
+
summary: filter any words that need to be harmonized
|
96
|
+
test_files:
|
97
|
+
- spec/harmonious_dictionary_spec.rb
|
98
|
+
- spec/model_additions_spec.rb
|
99
|
+
- spec/spec_helper.rb
|