yandex_mystem 0.1.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -7
- data/bin/mystem-bsd +0 -0
- data/bin/mystem-linux-32 +0 -0
- data/bin/mystem-linux-64 +0 -0
- data/bin/mystem-win.exe +0 -0
- data/lib/yandex_mystem/version.rb +1 -1
- data/lib/yandex_mystem.rb +13 -46
- data/spec/yandex_mystem_spec.rb +37 -25
- metadata +11 -8
- data/.rvmrc +0 -1
- data/app/mystem-linux-32 +0 -0
- data/app/mystem-linux-64 +0 -0
- data/app/mystem-mac +0 -0
- data/app/mystem-win.exe +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7c1c4fcbbbafd601e0457db5366a8d82639955d
|
4
|
+
data.tar.gz: 3da18f1e83a23747a51786fcda68b5ccd109dfb2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fdca784ffa0621fc345ffa080e00131c43db0ac86f14012082271ed378a1cfa32ff1145acfecb77ef2d336e8f6a8ed4e976375b81ed3fbedbfd94aaf7213ba97
|
7
|
+
data.tar.gz: 6fa8de5e45b069137e46165d6da1cd7d0209b297e85de996a30340d1171cf8d85eb1194787f0fa40bae12fbb4ccd92ecef0601462cfdc3e2e6664dbf9ff8f234
|
data/README.md
CHANGED
@@ -3,6 +3,12 @@
|
|
3
3
|
[](http://travis-ci.org/dmitry/yandex_mystem) [](http://badge.fury.io/rb/yandex_mystem) [](https://bitdeli.com/free "Bitdeli Badge")
|
4
4
|
|
5
5
|
|
6
|
+
## Version 3.0 is not compatible with previous versions
|
7
|
+
|
8
|
+
Mystem 3.0 supports the JSON output format, so this gem now uses that option.
|
9
|
+
This gem (yandex_mystem) now returns information in a JSON-like native Ruby format (an array of hashes).
|
10
|
+
More info about Mystem changes: http://api.yandex.ru/mystem/downloads/
|
11
|
+
|
6
12
|
## Introduction
|
7
13
|
|
8
14
|
Mystem is a software that provided by the Yandex only for non-commercial project. With use of it you can detect base forms of the words in a text, make a simple morphological analysis of russian words.
|
@@ -13,7 +19,7 @@ Mystem is a software that provided by the Yandex only for non-commercial project
|
|
13
19
|
|
14
20
|
## License
|
15
21
|
|
16
|
-
First of all, read license on http://
|
22
|
+
First of all, read the license at http://api.yandex.ru/mystem/
|
17
23
|
|
18
24
|
`Mystem` is available only for non-commercial use.
|
19
25
|
|
@@ -21,14 +27,14 @@ First of all, read license on http://company.yandex.ru/technologies/mystem/
|
|
21
27
|
|
22
28
|
This gem contains executables for these platforms:
|
23
29
|
|
24
|
-
* Windows
|
25
|
-
* Linux
|
26
|
-
* Linux
|
27
|
-
*
|
30
|
+
* Windows 7 32-bit
|
31
|
+
* Linux 3.5 32-bit
|
32
|
+
* Linux 3.1 64-bit
|
33
|
+
* FreeBSD 9.0 64-bit
|
28
34
|
|
29
|
-
|
35
|
+
Note: Mystem 3.0 does not support Mac OS X, sorry.
|
30
36
|
|
31
37
|
## Usage
|
32
38
|
|
33
39
|
YandexMystem::Simple.stem 'О предложении в котором много слов.'
|
34
|
-
YandexMystem::
|
40
|
+
YandexMystem::Raw.stem 'нет сов'
|
data/bin/mystem-bsd
ADDED
Binary file
|
data/bin/mystem-linux-32
ADDED
Binary file
|
data/bin/mystem-linux-64
ADDED
Binary file
|
data/bin/mystem-win.exe
ADDED
Binary file
|
data/lib/yandex_mystem.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'open3'
|
2
4
|
require 'yandex_mystem/version'
|
3
5
|
require 'pathname'
|
6
|
+
require 'json'
|
4
7
|
|
5
8
|
module YandexMystem
|
6
|
-
class Base
|
7
|
-
WORD_SCANNER_REGEXP = /^([^\{]+)\{(.+)\}$/.freeze
|
9
|
+
class Base
|
8
10
|
|
9
11
|
def self.stem(text)
|
10
12
|
exec = [command, self::ARGUMENTS].join(' ')
|
@@ -20,7 +22,7 @@ module YandexMystem
|
|
20
22
|
|
21
23
|
def self.command
|
22
24
|
@command ||= begin
|
23
|
-
path = Pathname.new(__FILE__) + '../../
|
25
|
+
path = Pathname.new(__FILE__) + '../../bin/'
|
24
26
|
path + "mystem-#{command_postfix}"
|
25
27
|
end
|
26
28
|
end
|
@@ -34,9 +36,9 @@ module YandexMystem
|
|
34
36
|
when /64.+linux$/
|
35
37
|
'linux-64'
|
36
38
|
when /darwin/
|
37
|
-
'
|
39
|
+
raise 'Mystem 3.0 does not support Max OS X.'
|
38
40
|
when /freebsd/
|
39
|
-
|
41
|
+
'bsd'
|
40
42
|
else
|
41
43
|
raise 'Unknown OS'
|
42
44
|
end
|
@@ -44,54 +46,19 @@ module YandexMystem
|
|
44
46
|
end
|
45
47
|
|
46
48
|
class Simple < Base
|
47
|
-
ARGUMENTS = '-e utf-8 -n'
|
48
|
-
|
49
|
-
NOT_INCLUDE_REGEXP = /.+\?\?$/.freeze
|
49
|
+
ARGUMENTS = '-e utf-8 -n --format json'
|
50
50
|
|
51
51
|
|
52
52
|
def self.parse(data)
|
53
|
-
|
54
|
-
words = words.split('|').select do |w|
|
55
|
-
!(w =~ NOT_INCLUDE_REGEXP)
|
56
|
-
end
|
57
|
-
|
58
|
-
[word, words]
|
59
|
-
end.flatten(1)
|
60
|
-
|
61
|
-
Hash[*parsed]
|
53
|
+
Hash[ JSON.parse('[' + data.split("\n").join(",") + ']', :symbolize_names => true).inject([]){|s, h| s + [[ h[:text], h[:analysis].map{|a| a[:lex]} ]]} ]
|
62
54
|
end
|
63
55
|
end
|
64
56
|
|
65
|
-
class
|
66
|
-
ARGUMENTS = '-e utf-8 -
|
67
|
-
|
68
|
-
REGEXP = /([^\|:]+):([0-9\.]+)=([A-Z]+)/
|
69
|
-
|
70
|
-
Word = Struct.new(:word, :frequency, :part)
|
57
|
+
class Raw < Base
|
58
|
+
ARGUMENTS = '-e utf-8 -ig -n --weight --format json --eng-gr'
|
71
59
|
|
72
60
|
def self.parse(data)
|
73
|
-
|
74
|
-
|
75
|
-
data.scan(WORD_SCANNER_REGEXP).each do |(word, words)|
|
76
|
-
unless parsed.key?(word)
|
77
|
-
words = words.scan(REGEXP).map do |w|
|
78
|
-
to_word(w)
|
79
|
-
end
|
80
|
-
|
81
|
-
unless words.size.zero?
|
82
|
-
parsed[word] = words.sort_by(&:frequency).reverse
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
parsed
|
88
|
-
end
|
89
|
-
|
90
|
-
private
|
91
|
-
|
92
|
-
def self.to_word(w)
|
93
|
-
word, frequency, part = w
|
94
|
-
Word.new(word, frequency.to_f, part)
|
95
|
-
end
|
61
|
+
JSON.parse('[' + data.split("\n").join(",") + ']', :symbolize_names => true)
|
62
|
+
end
|
96
63
|
end
|
97
64
|
end
|
data/spec/yandex_mystem_spec.rb
CHANGED
@@ -4,7 +4,28 @@ require 'spec_helper'
|
|
4
4
|
describe YandexMystem do
|
5
5
|
context YandexMystem::Simple do
|
6
6
|
it "should stem words" do
|
7
|
-
data = YandexMystem::Simple.stem(
|
7
|
+
data = YandexMystem::Simple.stem("мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements")
|
8
|
+
data['мальчики'].should eq %w(мальчик)
|
9
|
+
data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
|
10
|
+
data['девочки'].should eq %w(девочка)
|
11
|
+
data['девочек'].should eq %w(девочка)
|
12
|
+
data['сов'].should eq %w(сова)
|
13
|
+
data['пошли'].should eq %w(пойти посылать)
|
14
|
+
data['elements'].should eq []
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should stem words in few lines" do
|
18
|
+
data = YandexMystem::Simple.stem(%[
|
19
|
+
|
20
|
+
|
21
|
+
мальчики
|
22
|
+
мальчиков
|
23
|
+
девочки девочек компьютеров компьютере сов пошли
|
24
|
+
elements
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
])
|
8
29
|
data['мальчики'].should eq %w(мальчик)
|
9
30
|
data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
|
10
31
|
data['девочки'].should eq %w(девочка)
|
@@ -15,41 +36,32 @@ describe YandexMystem do
|
|
15
36
|
end
|
16
37
|
end
|
17
38
|
|
18
|
-
context YandexMystem::
|
39
|
+
context YandexMystem::Raw do
|
19
40
|
it 'latin words' do
|
20
|
-
|
21
|
-
|
22
|
-
words.size.should eq 0
|
41
|
+
response = YandexMystem::Raw.stem('Elements')
|
23
42
|
|
24
|
-
|
43
|
+
response.size.should eq 1
|
44
|
+
response[0][:analysis].should eq []
|
45
|
+
response[0][:text].should eq 'Elements'
|
25
46
|
end
|
26
47
|
|
27
48
|
it 'multiple definitions' do
|
28
|
-
|
49
|
+
response = YandexMystem::Raw.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
|
29
50
|
|
30
|
-
|
51
|
+
response.size.should eq 11
|
31
52
|
|
32
|
-
|
33
|
-
|
34
|
-
|
53
|
+
response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:lex].should eq 'девочка'
|
54
|
+
response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:wt].should eq 1
|
55
|
+
response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:gr].should eq 'S,f,anim=nom,sg'
|
35
56
|
|
36
|
-
|
37
|
-
words['мальчиков'].size.should eq 3
|
57
|
+
response.find_all{|h| h[:text] == 'мальчиков'}.first[:analysis].size.should eq 3
|
38
58
|
end
|
39
59
|
|
40
|
-
it '
|
41
|
-
|
42
|
-
|
43
|
-
words['сосланный'].size.should eq 2
|
44
|
-
|
45
|
-
words['сосланный'].map(&:word).should eq %w(ссылать сосланный)
|
46
|
-
words['сосланный'].map(&:frequency).should eq [1.2, 0.5]
|
47
|
-
words['сосланный'].map(&:part).should eq %w(V S)
|
60
|
+
it 'get geo name' do
|
61
|
+
response = YandexMystem::Raw.stem('Москва')
|
62
|
+
response.find_all{|h| h[:text] == 'Москва'}.first[:analysis].first[:gr].should include 'geo'
|
48
63
|
end
|
49
64
|
|
50
|
-
|
51
|
-
YandexMystem::Extended.should_receive(:to_word).exactly(4).times.and_return(YandexMystem::Extended.const_get(:Word).new('в', 2, 'S'))
|
52
|
-
YandexMystem::Extended.stem('В в в в')
|
53
|
-
end
|
65
|
+
|
54
66
|
end
|
55
67
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yandex_mystem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Polushkin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -43,21 +43,24 @@ description: Mystem is a software that provided by the Yandex only for non-comme
|
|
43
43
|
simple morphological analysis of russian words.
|
44
44
|
email:
|
45
45
|
- dmitry.polushkin@gmail.com
|
46
|
-
executables:
|
46
|
+
executables:
|
47
|
+
- mystem-bsd
|
48
|
+
- mystem-linux-32
|
49
|
+
- mystem-linux-64
|
50
|
+
- mystem-win.exe
|
47
51
|
extensions: []
|
48
52
|
extra_rdoc_files: []
|
49
53
|
files:
|
50
54
|
- ".gitignore"
|
51
55
|
- ".rspec"
|
52
|
-
- ".rvmrc"
|
53
56
|
- ".travis.yml"
|
54
57
|
- Gemfile
|
55
58
|
- README.md
|
56
59
|
- Rakefile
|
57
|
-
-
|
58
|
-
-
|
59
|
-
-
|
60
|
-
-
|
60
|
+
- bin/mystem-bsd
|
61
|
+
- bin/mystem-linux-32
|
62
|
+
- bin/mystem-linux-64
|
63
|
+
- bin/mystem-win.exe
|
61
64
|
- lib/yandex_mystem.rb
|
62
65
|
- lib/yandex_mystem/version.rb
|
63
66
|
- spec/spec_helper.rb
|
data/.rvmrc
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
rvm use 2.0.0 --create
|
data/app/mystem-linux-32
DELETED
Binary file
|
data/app/mystem-linux-64
DELETED
Binary file
|
data/app/mystem-mac
DELETED
Binary file
|
data/app/mystem-win.exe
DELETED
Binary file
|