ansj_seg 0.0.5-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 977533c05998577f3a417f3f074a31873b94dfec
4
+ data.tar.gz: 8bd5a644c16111c097d958ad5fc9349139b39379
5
+ SHA512:
6
+ metadata.gz: caa06fda8c4eec4468444ee3708a51eb0752b3e986f7258153969943245ff7117f1bedd337a1ba4e413dd7e8e8494b4b80e5af1441bec405b79d3d93a8b8ed20
7
+ data.tar.gz: 2b79703de604e3e725bb925c2f7be6d1d5076181199847077e0ffb50efee116b01965fd22ff35f91a8c07f20dda83ad8ceb8229cd11af1bb2fcf0f48470adb41
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .DS_Store
@@ -0,0 +1 @@
1
+ jruby-9.1.6.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in ansj_seg.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Howl王
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,106 @@
1
+ # Ansj中文分词 for JRuby
2
+
3
+ ## 引用原项目摘要
4
+ > 这是一个基于 n-Gram + CRF + HMM 的中文分词的 Java 实现。
5
+
6
+ > 分词速度达到每秒钟大约 200万字左右(原作者 MacbookAir 下测试),准确率能达到 96% 以上
7
+
8
+ > 目前实现了 中文分词、中文姓名识别、用户自定义词典、关键字提取、自动摘要、关键字标记等功能。
9
+
10
+ > 可以应用到自然语言处理等方面,适用于对分词效果要求高的各种项目。
11
+
12
+ Read Ansj中文分词 docs for more details: http://nlpchina.github.io/ansj_seg
13
+
14
+ ## Installing
15
+
16
+ Add to your `Gemfile`:
17
+
18
+ ```ruby
19
+ gem 'ansj_seg'
20
+ ```
21
+
22
+ Then `bundle install`.
23
+
24
+ Or install it yourself as:
25
+
26
+ $ gem install ansj_seg
27
+
28
+ ## Usage
29
+
30
+ ```ruby
31
+ require 'jrjackson' # 可选项, JRuby 下最快的 json 解析器, 需要提前:gem install jrjackson -V
32
+ require 'ansj_seg'
33
+
34
+ # 设置用户(默认)词典
35
+ AnsjSeg::Config::DIC['dic'] = '/Users/howl/Desktop/library/default.dic'
36
+ # 设置用户自定义词典,key 的规则是以 dic_ 为前缀
37
+ AnsjSeg::Config::DIC['dic_souhu'] = '/Users/howl/Desktop/library/souhu.dic'
38
+ # 设置CRF模型,用户自定义CRF模型时,key 的规则是以 crf_ 为前缀
39
+ AnsjSeg::Config::CRF['crf'] = '/Users/howl/Desktop/library/crf.model'
40
+ # 设置歧义词典
41
+ AnsjSeg::Config.ambiguityLibrary = '/Users/howl/Desktop/library/ambiguity.dic'
42
+
43
+ # 设置过滤器(过滤掉词性为 null(空)或 w(标点)的词,以及“是”、“的”这两个词)
44
+ AnsjSeg.fitler.ignore(natures: ['null', 'w'], words: ['是', '的'])
45
+
46
+ text = "Ruby China,对!没错!这里就是 Ruby 社区,目前这里已经是国内最权威的 Ruby 社区,拥有国内所有资深的 Ruby 工程师。"
47
+ ```
48
+
49
+ ```ruby
50
+ # 分词
51
+ # 第二个参数可选,支持 :to、:nlp、:index 三种分词模式(省略时使用 BaseAnalysis)
52
+ text.to_a(:terms) # text.to_a(:terms, :nlp)
53
+ ```
54
+ >
55
+ ```ruby
56
+ [
57
+ {:name=>"ruby", :natureStr=>"en", :newWord=>false, :offe=>0, :realName=>"ruby"},
58
+ {:name=>"china", :natureStr=>"en", :newWord=>false, :offe=>5, :realName=>"china"},
59
+ {:name=>"对", :natureStr=>"p", :newWord=>false, :offe=>11, :realName=>"对"},
60
+ {:name=>"没错", :natureStr=>"v", :newWord=>false, :offe=>13, :realName=>"没错"},
61
+ {:name=>"这里", :natureStr=>"r", :newWord=>false, :offe=>16, :realName=>"这里"},
62
+ {:name=>"就", :natureStr=>"d", :newWord=>false, :offe=>18, :realName=>"就"},
63
+ {:name=>"ruby", :natureStr=>"en", :newWord=>false, :offe=>21, :realName=>"ruby"},
64
+ {:name=>"社区", :natureStr=>"n", :newWord=>false, :offe=>26, :realName=>"社区"},
65
+ {:name=>"目前", :natureStr=>"t", :newWord=>false, :offe=>29, :realName=>"目前"},
66
+ {:name=>"这", :natureStr=>"r", :newWord=>false, :offe=>31, :realName=>"这"},
67
+ {:name=>"里", :natureStr=>"f", :newWord=>false, :offe=>32, :realName=>"里"},
68
+ {:name=>"已经", :natureStr=>"d", :newWord=>false, :offe=>33, :realName=>"已经"},
69
+ {:name=>"国内", :natureStr=>"s", :newWord=>false, :offe=>36, :realName=>"国内"},
70
+ {:name=>"最", :natureStr=>"d", :newWord=>false, :offe=>38, :realName=>"最"},
71
+ {:name=>"权威", :natureStr=>"n", :newWord=>false, :offe=>39, :realName=>"权威"},
72
+ {:name=>"ruby", :natureStr=>"en", :newWord=>false, :offe=>43, :realName=>"ruby"},
73
+ {:name=>"社区", :natureStr=>"n", :newWord=>false, :offe=>48, :realName=>"社区"},
74
+ {:name=>"拥有", :natureStr=>"v", :newWord=>false, :offe=>51, :realName=>"拥有"},
75
+ {:name=>"国内", :natureStr=>"s", :newWord=>false, :offe=>53, :realName=>"国内"},
76
+ {:name=>"所有", :natureStr=>"b", :newWord=>false, :offe=>55, :realName=>"所有"},
77
+ {:name=>"资深", :natureStr=>"b", :newWord=>false, :offe=>57, :realName=>"资深"},
78
+ {:name=>"ruby", :natureStr=>"en", :newWord=>false, :offe=>61, :realName=>"ruby"},
79
+ {:name=>"工程师", :natureStr=>"n", :newWord=>false, :offe=>66, :realName=>"工程师"}
80
+ ]
81
+ ```
82
+
83
+ ```ruby
84
+ # 提取关键词
85
+ # 第二个参数指定返回的关键词个数,默认:20
86
+ text.to_a(:words) # text.to_a(:words, 5)
87
+ ```
88
+ >
89
+ ```ruby
90
+ [
91
+ {:freq=>2, :name=>"这里", :score=>16.315514814428745},
92
+ {:freq=>2, :name=>"社区", :score=>14.99970404519092},
93
+ {:freq=>2, :name=>"国内", :score=>13.684318222044968},
94
+ {:freq=>1, :name=>"目前", :score=>5.3946562994797125},
95
+ {:freq=>1, :name=>"已经", :score=>4.868333845606951},
96
+ {:freq=>1, :name=>"权威", :score=>4.078866889481741},
97
+ {:freq=>1, :name=>"所有", :score=>1.973668895869867},
98
+ {:freq=>1, :name=>"资深", :score=>1.7105130430872182},
99
+ {:freq=>1, :name=>"没错", :score=>1.4999705998354727},
100
+ {:freq=>1, :name=>"就是", :score=>1.3683942314288522},
101
+ {:freq=>1, :name=>"工程师", :score=>0.5263054050944183},
102
+ {:freq=>1, :name=>"拥有", :score=>0.4999901999451576}
103
+ ]
104
+ ```
105
+
106
+ *PS.* Built and tested on JRuby 9.1.6
@@ -0,0 +1,20 @@
1
+ #-*- mode: ruby -*-
2
+ require 'ruby-maven'
3
+
4
+ desc "Pack jar after compiling classes"
5
+ task :compile do
6
+ RubyMaven.exec('prepare-package')
7
+ end
8
+
9
+ desc "Clean build"
10
+ task :clean do
11
+ RubyMaven.exec('clean')
12
+ end
13
+
14
+ task :default => [ :compile ]
15
+
16
+ require 'rubygems/package_task'
17
+ Gem::PackageTask.new( eval File.read( 'ansj_seg.gemspec' ) ) do
18
+ desc 'Pack gem'
19
+ task :package => [:compile]
20
+ end
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ lib = File.expand_path('../lib', __FILE__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+ require 'ansj_seg/version'
6
+
7
+ Gem::Specification.new do |spec|
8
+ spec.name = "ansj_seg"
9
+ spec.version = AnsjSeg::VERSION
10
+ spec.authors = ["Howl王"]
11
+ spec.email = ["mimosa@aliyun.com"]
12
+ spec.description = %q{AnsjSeg for jRuby}
13
+ spec.summary = %q{ansj_seg for jRuby}
14
+ spec.homepage = "https://github.com/NLPchina/ansj_seg"
15
+ spec.license = "MIT"
16
+ # important to get the jars installed
17
+ spec.platform = 'java'
18
+
19
+ spec.files = `git ls-files`.split($/)
20
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
21
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
22
+ spec.require_paths = ['lib']
23
+
24
+ spec.requirements << 'jar org.nlpcn, nlp-lang, 1.7'
25
+ spec.requirements << 'jar org.ansj, ansj_seg, 5.0.1'
26
+
27
+ spec.add_runtime_dependency 'ruby-maven', '~> 3.3', '>= 3.3.12'
28
+ spec.add_runtime_dependency 'jar-dependencies', '~> 0.3.5'
29
+ spec.add_runtime_dependency 'multi_json', '~> 1.12', '>= 1.12.1'
30
+ end
@@ -0,0 +1,3 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # Resolved relative to this file: only lib/ is a require path of the gem, so a bare require of 'lib/ansj_seg' is not found.
3
+ require_relative 'lib/ansj_seg'
@@ -0,0 +1,93 @@
1
+ # -*- encoding: utf-8 -*-
2
+ unless RUBY_PLATFORM =~ /java/
3
+ puts 'This library is only compatible with a java-based ruby environment like JRuby.'
4
+ exit 255
5
+ end
6
+
7
+ require 'ansj_seg_jars'
8
+ require 'multi_json'
9
+
10
+ module AnsjSeg
11
+ def self.fitler
12
+ @fitler ||= Filter.new
13
+ end
14
+
15
+ class Filter < Java::OrgAnsjRecognitionImpl::FilterRecognition
16
+ def ignore(opts = { natures: ['null', 'w'], words: ['是', '的'] })
17
+ if opts[:natures].is_a?(Array) && !opts[:natures].empty?
18
+ self.insertStopNatures(*opts.delete(:natures))
19
+ end
20
+
21
+ if opts[:words].is_a?(Array) && !opts[:words].empty?
22
+ self.insertStopWords(opts.delete(:words))
23
+ end
24
+
25
+ if opts[:regex].is_a?(Array) && !opts[:regex].empty?
26
+ opts.delete(:regex).map { |r| self.insertStopRegex(r) }
27
+ end
28
+ end
29
+
30
+ def ignore!(opts = {})
31
+ self.ignore(opts)
32
+ self
33
+ end
34
+ end
35
+
36
+ class Config < Java::OrgAnsjUtil::MyStaticValue
37
+ end
38
+
39
+ module JSON
40
+ extend MultiJson
41
+
42
+ def self.trans(obj)
43
+ json = self.dump(obj).to_s
44
+ self.load(json, use_bigdecimal: false, symbolize_keys: true)
45
+ end
46
+ end
47
+
48
+ module String
49
+ include_package 'org.ansj.splitWord.analysis'
50
+
51
+ def to_a(pattern = nil, limit = 0)
52
+ case pattern.to_s
53
+ when /terms/
54
+ split_terms(limit)
55
+ when /words/
56
+ split_words(limit)
57
+ else
58
+ split(pattern, limit)
59
+ end
60
+ end
61
+ alias_method :拆, :to_a
62
+
63
+ private
64
+
65
+ def split_terms(mode = nil)
66
+ klass = case mode.to_s
67
+ when /to/
68
+ ToAnalysis
69
+ when /nlp/
70
+ NlpAnalysis
71
+ when /index/
72
+ IndexAnalysis
73
+ else
74
+ BaseAnalysis
75
+ end
76
+
77
+ JSON.trans klass.parse(self).recognition(AnsjSeg.fitler.ignore!).getTerms
78
+ end
79
+
80
+ def split_words(limit = 0)
81
+ limit = if limit.to_i == 0
82
+ 20
83
+ else
84
+ limit
85
+ end
86
+
87
+ JSON.trans Java::OrgAnsjAppKeyword.KeyWordComputer.new(limit).computeArticleTfidf(self)
88
+ end
89
+ end
90
+
91
+ end # AnsjSeg
92
+
93
+ ::String.include(AnsjSeg::String) # mix the segmentation helpers into the core String class
@@ -0,0 +1,5 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module AnsjSeg
4
+ VERSION = '0.0.5' # release version of the gem; ansj_seg.gemspec assigns it to spec.version
5
+ end
@@ -0,0 +1,39 @@
1
+ package org.ansj;
2
+
3
+ import org.ansj.splitWord.analysis.ToAnalysis;
4
+ import org.ansj.splitWord.analysis.NlpAnalysis;
5
+ import org.ansj.app.keyword.KeyWordComputer;
6
+
7
+ public class SegApi {
8
+ // NOTE(review): @Execute is used on every method below, but no import for it is visible in this file — confirm where it is declared.
9
+ /**
10
+ * Segmentation service; uses ToAnalysis.
11
+ * @param content the text to segment
12
+ * @return the terms produced by segmenting the text
13
+ */
14
+ @Execute
15
+ public Object toSeg(String content){
16
+ return ToAnalysis.parse(content).getTerms();
17
+ }
18
+
19
+ /**
20
+ * Segmentation service; uses NlpAnalysis.
21
+ * @param content the text to segment
22
+ * @return the terms produced by segmenting the text
23
+ */
24
+ @Execute
25
+ public Object nlpSeg(String content){
26
+ return NlpAnalysis.parse(content).getTerms();
27
+ }
28
+
29
+ /**
30
+ * Keyword extraction.
31
+ * @param content the text to extract keywords from
32
+ * @return the result of computeArticleTfidf on a KeyWordComputer constructed with 20
33
+ */
34
+ @Execute
35
+ public Object keyWord(String content){
36
+ return new KeyWordComputer(20).computeArticleTfidf(content);
37
+ }
38
+
39
+ }
@@ -0,0 +1,17 @@
1
+ <?xml version='1.0'?>
2
+ <settings>
3
+ <profiles>
4
+ <profile>
5
+ <id>ansj_seg</id>
6
+ <repositories>
7
+ <repository>
8
+ <id>mvn-repo</id>
9
+ <url>http://maven.nlpcn.org/</url>
10
+ </repository>
11
+ </repositories>
12
+ </profile>
13
+ </profiles>
14
+ <activeProfiles>
15
+ <activeProfile>ansj_seg</activeProfile>
16
+ </activeProfiles>
17
+ </settings>
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ansj_seg
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.5
5
+ platform: java
6
+ authors:
7
+ - Howl王
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-11-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '3.3'
19
+ - - ">="
20
+ - !ruby/object:Gem::Version
21
+ version: 3.3.12
22
+ name: ruby-maven
23
+ prerelease: false
24
+ type: :runtime
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '3.3'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 3.3.12
33
+ - !ruby/object:Gem::Dependency
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - "~>"
37
+ - !ruby/object:Gem::Version
38
+ version: 0.3.5
39
+ name: jar-dependencies
40
+ prerelease: false
41
+ type: :runtime
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: 0.3.5
47
+ - !ruby/object:Gem::Dependency
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - "~>"
51
+ - !ruby/object:Gem::Version
52
+ version: '1.12'
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: 1.12.1
56
+ name: multi_json
57
+ prerelease: false
58
+ type: :runtime
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: '1.12'
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 1.12.1
67
+ description: AnsjSeg for jRuby
68
+ email:
69
+ - mimosa@aliyun.com
70
+ executables: []
71
+ extensions: []
72
+ extra_rdoc_files: []
73
+ files:
74
+ - ".gitignore"
75
+ - ".ruby-version"
76
+ - Gemfile
77
+ - LICENSE.txt
78
+ - README.md
79
+ - Rakefile
80
+ - ansj_seg.gemspec
81
+ - ansj_seg.rb
82
+ - lib/ansj_seg.rb
83
+ - lib/ansj_seg/version.rb
84
+ - lib/src/main/java/org/ansj/SegApi.java
85
+ - settings.xml
86
+ homepage: https://github.com/NLPchina/ansj_seg
87
+ licenses:
88
+ - MIT
89
+ metadata: {}
90
+ post_install_message:
91
+ rdoc_options: []
92
+ require_paths:
93
+ - lib
94
+ required_ruby_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ required_rubygems_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ requirements:
105
+ - jar org.nlpcn, nlp-lang, 1.7
106
+ - jar org.ansj, ansj_seg, 5.0.1
107
+ rubyforge_project:
108
+ rubygems_version: 2.6.8
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: ansj_seg for jRuby
112
+ test_files: []