yandex_mystem 0.1.1 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +13 -7
- data/bin/mystem-bsd +0 -0
- data/bin/mystem-linux-32 +0 -0
- data/bin/mystem-linux-64 +0 -0
- data/bin/mystem-win.exe +0 -0
- data/lib/yandex_mystem/version.rb +1 -1
- data/lib/yandex_mystem.rb +13 -46
- data/spec/yandex_mystem_spec.rb +37 -25
- metadata +11 -8
- data/.rvmrc +0 -1
- data/app/mystem-linux-32 +0 -0
- data/app/mystem-linux-64 +0 -0
- data/app/mystem-mac +0 -0
- data/app/mystem-win.exe +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7c1c4fcbbbafd601e0457db5366a8d82639955d
|
4
|
+
data.tar.gz: 3da18f1e83a23747a51786fcda68b5ccd109dfb2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fdca784ffa0621fc345ffa080e00131c43db0ac86f14012082271ed378a1cfa32ff1145acfecb77ef2d336e8f6a8ed4e976375b81ed3fbedbfd94aaf7213ba97
|
7
|
+
data.tar.gz: 6fa8de5e45b069137e46165d6da1cd7d0209b297e85de996a30340d1171cf8d85eb1194787f0fa40bae12fbb4ccd92ecef0601462cfdc3e2e6664dbf9ff8f234
|
data/README.md
CHANGED
@@ -3,6 +3,12 @@
|
|
3
3
|
[![Build Status](https://secure.travis-ci.org/dmitry/yandex_mystem.png?branch=master)](http://travis-ci.org/dmitry/yandex_mystem) [![Gem Version](https://badge.fury.io/rb/yandex_mystem.png)](http://badge.fury.io/rb/yandex_mystem) [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/dmitry/yandex_mystem/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
|
4
4
|
|
5
5
|
|
6
|
+
## Version 3.0 is not compatible with previous versions
|
7
|
+
|
8
|
+
Mystem 3.0 supports the JSON output format, so this gem now uses that option.
|
9
|
+
This gem (yandex_mystem) now returns information in a JSON-like native Ruby format (an array of hashes).
|
10
|
+
More info about Mystem changes: http://api.yandex.ru/mystem/downloads/
|
11
|
+
|
6
12
|
## Introduction
|
7
13
|
|
8
14
|
Mystem is software provided by Yandex for non-commercial projects only. With it you can detect the base forms of words in a text and perform a simple morphological analysis of Russian words.
|
@@ -13,7 +19,7 @@ Mystem is a software that provided by the Yandex only for non-commercial project
|
|
13
19
|
|
14
20
|
## License
|
15
21
|
|
16
|
-
First of all, read license on http://
|
22
|
+
First of all, read license on http://api.yandex.ru/mystem/
|
17
23
|
|
18
24
|
`Mystem` is available only for non-commercial usage.
|
19
25
|
|
@@ -21,14 +27,14 @@ First of all, read license on http://company.yandex.ru/technologies/mystem/
|
|
21
27
|
|
22
28
|
This gem contains executables for these platforms:
|
23
29
|
|
24
|
-
* Windows
|
25
|
-
* Linux
|
26
|
-
* Linux
|
27
|
-
*
|
30
|
+
* Windows 7 32-bit
|
31
|
+
* Linux 3.5 32-bit
|
32
|
+
* Linux 3.1 64-bit
|
33
|
+
* FreeBSD 9.0 64-bit
|
28
34
|
|
29
|
-
|
35
|
+
Note: Mystem 3.0 does not support Mac OS X, sorry.
|
30
36
|
|
31
37
|
## Usage
|
32
38
|
|
33
39
|
YandexMystem::Simple.stem 'О предложении в котором много слов.'
|
34
|
-
YandexMystem::
|
40
|
+
YandexMystem::Raw.stem 'нет сов'
|
data/bin/mystem-bsd
ADDED
Binary file
|
data/bin/mystem-linux-32
ADDED
Binary file
|
data/bin/mystem-linux-64
ADDED
Binary file
|
data/bin/mystem-win.exe
ADDED
Binary file
|
data/lib/yandex_mystem.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'open3'
|
2
4
|
require 'yandex_mystem/version'
|
3
5
|
require 'pathname'
|
6
|
+
require 'json'
|
4
7
|
|
5
8
|
module YandexMystem
|
6
|
-
class Base
|
7
|
-
WORD_SCANNER_REGEXP = /^([^\{]+)\{(.+)\}$/.freeze
|
9
|
+
class Base
|
8
10
|
|
9
11
|
def self.stem(text)
|
10
12
|
exec = [command, self::ARGUMENTS].join(' ')
|
@@ -20,7 +22,7 @@ module YandexMystem
|
|
20
22
|
|
21
23
|
def self.command
|
22
24
|
@command ||= begin
|
23
|
-
path = Pathname.new(__FILE__) + '../../
|
25
|
+
path = Pathname.new(__FILE__) + '../../bin/'
|
24
26
|
path + "mystem-#{command_postfix}"
|
25
27
|
end
|
26
28
|
end
|
@@ -34,9 +36,9 @@ module YandexMystem
|
|
34
36
|
when /64.+linux$/
|
35
37
|
'linux-64'
|
36
38
|
when /darwin/
|
37
|
-
'
|
39
|
+
raise 'Mystem 3.0 does not support Max OS X.'
|
38
40
|
when /freebsd/
|
39
|
-
|
41
|
+
'bsd'
|
40
42
|
else
|
41
43
|
raise 'Unknown OS'
|
42
44
|
end
|
@@ -44,54 +46,19 @@ module YandexMystem
|
|
44
46
|
end
|
45
47
|
|
46
48
|
class Simple < Base
|
47
|
-
ARGUMENTS = '-e utf-8 -n'
|
48
|
-
|
49
|
-
NOT_INCLUDE_REGEXP = /.+\?\?$/.freeze
|
49
|
+
ARGUMENTS = '-e utf-8 -n --format json'
|
50
50
|
|
51
51
|
|
52
52
|
def self.parse(data)
|
53
|
-
|
54
|
-
words = words.split('|').select do |w|
|
55
|
-
!(w =~ NOT_INCLUDE_REGEXP)
|
56
|
-
end
|
57
|
-
|
58
|
-
[word, words]
|
59
|
-
end.flatten(1)
|
60
|
-
|
61
|
-
Hash[*parsed]
|
53
|
+
Hash[ JSON.parse('[' + data.split("\n").join(",") + ']', :symbolize_names => true).inject([]){|s, h| s + [[ h[:text], h[:analysis].map{|a| a[:lex]} ]]} ]
|
62
54
|
end
|
63
55
|
end
|
64
56
|
|
65
|
-
class
|
66
|
-
ARGUMENTS = '-e utf-8 -
|
67
|
-
|
68
|
-
REGEXP = /([^\|:]+):([0-9\.]+)=([A-Z]+)/
|
69
|
-
|
70
|
-
Word = Struct.new(:word, :frequency, :part)
|
57
|
+
class Raw < Base
|
58
|
+
ARGUMENTS = '-e utf-8 -ig -n --weight --format json --eng-gr'
|
71
59
|
|
72
60
|
def self.parse(data)
|
73
|
-
|
74
|
-
|
75
|
-
data.scan(WORD_SCANNER_REGEXP).each do |(word, words)|
|
76
|
-
unless parsed.key?(word)
|
77
|
-
words = words.scan(REGEXP).map do |w|
|
78
|
-
to_word(w)
|
79
|
-
end
|
80
|
-
|
81
|
-
unless words.size.zero?
|
82
|
-
parsed[word] = words.sort_by(&:frequency).reverse
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
parsed
|
88
|
-
end
|
89
|
-
|
90
|
-
private
|
91
|
-
|
92
|
-
def self.to_word(w)
|
93
|
-
word, frequency, part = w
|
94
|
-
Word.new(word, frequency.to_f, part)
|
95
|
-
end
|
61
|
+
JSON.parse('[' + data.split("\n").join(",") + ']', :symbolize_names => true)
|
62
|
+
end
|
96
63
|
end
|
97
64
|
end
|
data/spec/yandex_mystem_spec.rb
CHANGED
@@ -4,7 +4,28 @@ require 'spec_helper'
|
|
4
4
|
describe YandexMystem do
|
5
5
|
context YandexMystem::Simple do
|
6
6
|
it "should stem words" do
|
7
|
-
data = YandexMystem::Simple.stem(
|
7
|
+
data = YandexMystem::Simple.stem("мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements")
|
8
|
+
data['мальчики'].should eq %w(мальчик)
|
9
|
+
data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
|
10
|
+
data['девочки'].should eq %w(девочка)
|
11
|
+
data['девочек'].should eq %w(девочка)
|
12
|
+
data['сов'].should eq %w(сова)
|
13
|
+
data['пошли'].should eq %w(пойти посылать)
|
14
|
+
data['elements'].should eq []
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should stem words in few lines" do
|
18
|
+
data = YandexMystem::Simple.stem(%[
|
19
|
+
|
20
|
+
|
21
|
+
мальчики
|
22
|
+
мальчиков
|
23
|
+
девочки девочек компьютеров компьютере сов пошли
|
24
|
+
elements
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
])
|
8
29
|
data['мальчики'].should eq %w(мальчик)
|
9
30
|
data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
|
10
31
|
data['девочки'].should eq %w(девочка)
|
@@ -15,41 +36,32 @@ describe YandexMystem do
|
|
15
36
|
end
|
16
37
|
end
|
17
38
|
|
18
|
-
context YandexMystem::
|
39
|
+
context YandexMystem::Raw do
|
19
40
|
it 'latin words' do
|
20
|
-
|
21
|
-
|
22
|
-
words.size.should eq 0
|
41
|
+
response = YandexMystem::Raw.stem('Elements')
|
23
42
|
|
24
|
-
|
43
|
+
response.size.should eq 1
|
44
|
+
response[0][:analysis].should eq []
|
45
|
+
response[0][:text].should eq 'Elements'
|
25
46
|
end
|
26
47
|
|
27
48
|
it 'multiple definitions' do
|
28
|
-
|
49
|
+
response = YandexMystem::Raw.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
|
29
50
|
|
30
|
-
|
51
|
+
response.size.should eq 11
|
31
52
|
|
32
|
-
|
33
|
-
|
34
|
-
|
53
|
+
response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:lex].should eq 'девочка'
|
54
|
+
response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:wt].should eq 1
|
55
|
+
response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:gr].should eq 'S,f,anim=nom,sg'
|
35
56
|
|
36
|
-
|
37
|
-
words['мальчиков'].size.should eq 3
|
57
|
+
response.find_all{|h| h[:text] == 'мальчиков'}.first[:analysis].size.should eq 3
|
38
58
|
end
|
39
59
|
|
40
|
-
it '
|
41
|
-
|
42
|
-
|
43
|
-
words['сосланный'].size.should eq 2
|
44
|
-
|
45
|
-
words['сосланный'].map(&:word).should eq %w(ссылать сосланный)
|
46
|
-
words['сосланный'].map(&:frequency).should eq [1.2, 0.5]
|
47
|
-
words['сосланный'].map(&:part).should eq %w(V S)
|
60
|
+
it 'get geo name' do
|
61
|
+
response = YandexMystem::Raw.stem('Москва')
|
62
|
+
response.find_all{|h| h[:text] == 'Москва'}.first[:analysis].first[:gr].should include 'geo'
|
48
63
|
end
|
49
64
|
|
50
|
-
|
51
|
-
YandexMystem::Extended.should_receive(:to_word).exactly(4).times.and_return(YandexMystem::Extended.const_get(:Word).new('в', 2, 'S'))
|
52
|
-
YandexMystem::Extended.stem('В в в в')
|
53
|
-
end
|
65
|
+
|
54
66
|
end
|
55
67
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yandex_mystem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Polushkin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -43,21 +43,24 @@ description: Mystem is a software that provided by the Yandex only for non-comme
|
|
43
43
|
simple morphological analysis of russian words.
|
44
44
|
email:
|
45
45
|
- dmitry.polushkin@gmail.com
|
46
|
-
executables:
|
46
|
+
executables:
|
47
|
+
- mystem-bsd
|
48
|
+
- mystem-linux-32
|
49
|
+
- mystem-linux-64
|
50
|
+
- mystem-win.exe
|
47
51
|
extensions: []
|
48
52
|
extra_rdoc_files: []
|
49
53
|
files:
|
50
54
|
- ".gitignore"
|
51
55
|
- ".rspec"
|
52
|
-
- ".rvmrc"
|
53
56
|
- ".travis.yml"
|
54
57
|
- Gemfile
|
55
58
|
- README.md
|
56
59
|
- Rakefile
|
57
|
-
-
|
58
|
-
-
|
59
|
-
-
|
60
|
-
-
|
60
|
+
- bin/mystem-bsd
|
61
|
+
- bin/mystem-linux-32
|
62
|
+
- bin/mystem-linux-64
|
63
|
+
- bin/mystem-win.exe
|
61
64
|
- lib/yandex_mystem.rb
|
62
65
|
- lib/yandex_mystem/version.rb
|
63
66
|
- spec/spec_helper.rb
|
data/.rvmrc
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
rvm use 2.0.0 --create
|
data/app/mystem-linux-32
DELETED
Binary file
|
data/app/mystem-linux-64
DELETED
Binary file
|
data/app/mystem-mac
DELETED
Binary file
|
data/app/mystem-win.exe
DELETED
Binary file
|