yandex_mystem 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rvmrc +1 -1
- data/.travis.yml +3 -0
- data/README.md +6 -2
- data/lib/yandex_mystem/version.rb +1 -1
- data/lib/yandex_mystem.rb +61 -19
- data/spec/yandex_mystem_spec.rb +50 -10
- data/yandex_mystem.gemspec +1 -0
- metadata +22 -11
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cba5e52c1b005179967ad5a066786c5050342018
|
4
|
+
data.tar.gz: f9112c1e6ce51124d48279f8218fa5d522b2ded8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9337422e5f0293516d4b8c48a5adc3ae1d48f1682d403d700480a4f9f71ee5a71f68bec89525001b6dc05ce7e01193a2bc609c089e8e603fa2bff271740d2a20
|
7
|
+
data.tar.gz: 1c84c101387557db4f8760445168412b3f14053e678689c63e2237b21137b978dea4c23f4633f26e2cd33a07cfe8b061f90339764056072d59a1bc58dd3de0a8
|
data/.rvmrc
CHANGED
@@ -1 +1 @@
|
|
1
|
-
rvm use
|
1
|
+
rvm use 2.0.0 --create
|
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
-
# Yandex Mystem
|
1
|
+
# Yandex Mystem 0.1.0
|
2
|
+
|
3
|
+
[Build Status](http://travis-ci.org/dmitry/yandex_mystem) [Bitdeli Badge](https://bitdeli.com/free "Bitdeli Badge")
|
4
|
+
|
2
5
|
|
3
6
|
## Introduction
|
4
7
|
|
@@ -27,4 +30,5 @@ This gem contains executables for these platforms:
|
|
27
30
|
|
28
31
|
## Usage
|
29
32
|
|
30
|
-
YandexMystem::
|
33
|
+
YandexMystem::Simple.stem 'О предложении в котором много слов.'
|
34
|
+
YandexMystem::Extended.stem 'нет сов'
|
data/lib/yandex_mystem.rb
CHANGED
@@ -3,35 +3,25 @@ require 'yandex_mystem/version'
|
|
3
3
|
|
4
4
|
module YandexMystem
|
5
5
|
class Base
|
6
|
-
|
6
|
+
WORD_SCANNER_REGEXP = /^([^\{]+)\{(.+)\}$/.freeze
|
7
|
+
|
7
8
|
def self.stem(text)
|
8
|
-
exec =
|
9
|
-
c << '-e utf-8 -n'
|
10
|
-
end.join(' ')
|
9
|
+
exec = [command, self::ARGUMENTS].join(' ')
|
11
10
|
|
12
|
-
data = Open3.popen3(exec) do |stdin, stdout,
|
11
|
+
data = Open3.popen3(exec) do |stdin, stdout, _|
|
13
12
|
stdin.write text
|
14
13
|
stdin.close
|
15
|
-
#stderr.read
|
16
14
|
stdout.read
|
17
15
|
end
|
18
16
|
|
19
|
-
data
|
20
|
-
words = words.split('|').select do |w|
|
21
|
-
!(w =~ /.+\?\?$/)
|
22
|
-
end
|
23
|
-
|
24
|
-
[word, words]
|
25
|
-
end.flatten(1)
|
26
|
-
|
27
|
-
Hash[*data]
|
17
|
+
parse(data)
|
28
18
|
end
|
29
19
|
|
30
|
-
private
|
31
|
-
|
32
20
|
def self.command
|
33
|
-
|
34
|
-
|
21
|
+
@command ||= begin
|
22
|
+
path = Pathname.new(__FILE__) + '../../app/'
|
23
|
+
path + "mystem-#{command_postfix}"
|
24
|
+
end
|
35
25
|
end
|
36
26
|
|
37
27
|
def self.command_postfix
|
@@ -51,4 +41,56 @@ module YandexMystem
|
|
51
41
|
end
|
52
42
|
end
|
53
43
|
end
|
44
|
+
|
45
|
+
class Simple < Base
|
46
|
+
ARGUMENTS = '-e utf-8 -n'
|
47
|
+
|
48
|
+
NOT_INCLUDE_REGEXP = /.+\?\?$/.freeze
|
49
|
+
|
50
|
+
|
51
|
+
def self.parse(data)
|
52
|
+
parsed = data.scan(WORD_SCANNER_REGEXP).map do |(word, words)|
|
53
|
+
words = words.split('|').select do |w|
|
54
|
+
!(w =~ NOT_INCLUDE_REGEXP)
|
55
|
+
end
|
56
|
+
|
57
|
+
[word, words]
|
58
|
+
end.flatten(1)
|
59
|
+
|
60
|
+
Hash[*parsed]
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
class Extended < Base
|
65
|
+
ARGUMENTS = '-e utf-8 -nifg'
|
66
|
+
|
67
|
+
REGEXP = /([^\|:]+):([0-9\.]+)=([A-Z]+)/
|
68
|
+
|
69
|
+
Word = Struct.new(:word, :frequency, :part)
|
70
|
+
|
71
|
+
def self.parse(data)
|
72
|
+
parsed = {}
|
73
|
+
|
74
|
+
data.scan(WORD_SCANNER_REGEXP).each do |(word, words)|
|
75
|
+
unless parsed.key?(word)
|
76
|
+
words = words.scan(REGEXP).map do |w|
|
77
|
+
to_word(w)
|
78
|
+
end
|
79
|
+
|
80
|
+
unless words.size.zero?
|
81
|
+
parsed[word] = words.sort_by(&:frequency).reverse
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
parsed
|
87
|
+
end
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
def self.to_word(w)
|
92
|
+
word, frequency, part = w
|
93
|
+
Word.new(word, frequency.to_f, part)
|
94
|
+
end
|
95
|
+
end
|
54
96
|
end
|
data/spec/yandex_mystem_spec.rb
CHANGED
@@ -2,14 +2,54 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe YandexMystem do
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
5
|
+
context YandexMystem::Simple do
|
6
|
+
it "should stem words" do
|
7
|
+
data = YandexMystem::Simple.stem('мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements')
|
8
|
+
data['мальчики'].should eq %w(мальчик)
|
9
|
+
data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
|
10
|
+
data['девочки'].should eq %w(девочка)
|
11
|
+
data['девочек'].should eq %w(девочка)
|
12
|
+
data['сов'].should eq %w(сова)
|
13
|
+
data['пошли'].should eq %w(пойти посылать)
|
14
|
+
data['elements'].should eq []
|
15
|
+
end
|
14
16
|
end
|
15
|
-
|
17
|
+
|
18
|
+
context YandexMystem::Extended do
|
19
|
+
it 'latin words' do
|
20
|
+
words = YandexMystem::Extended.stem('elements')
|
21
|
+
|
22
|
+
words.size.should eq 0
|
23
|
+
|
24
|
+
words['elements'].should be_nil
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'multiple definitions' do
|
28
|
+
words = YandexMystem::Extended.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
|
29
|
+
|
30
|
+
words.size.should eq 11
|
31
|
+
|
32
|
+
words['девочка'].first.word.should eq 'девочка'
|
33
|
+
words['девочка'].first.frequency.should eq 185.2
|
34
|
+
words['девочка'].first.part.should eq 'S'
|
35
|
+
|
36
|
+
|
37
|
+
words['мальчиков'].size.should eq 3
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'sort by frequency' do
|
41
|
+
words = YandexMystem::Extended.stem('сосланный')
|
42
|
+
|
43
|
+
words['сосланный'].size.should eq 2
|
44
|
+
|
45
|
+
words['сосланный'].map(&:word).should eq %w(ссылать сосланный)
|
46
|
+
words['сосланный'].map(&:frequency).should eq [1.2, 0.5]
|
47
|
+
words['сосланный'].map(&:part).should eq %w(V S)
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'set multiple times the same word, but lowercase and uppercase is different' do
|
51
|
+
YandexMystem::Extended.should_receive(:to_word).exactly(4).times.and_return(YandexMystem::Extended.const_get(:Word).new('в', 2, 'S'))
|
52
|
+
YandexMystem::Extended.stem('В в в в')
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/yandex_mystem.gemspec
CHANGED
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yandex_mystem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Dmitry Polushkin
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2013-08-04 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rspec
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ~>
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,11 +20,24 @@ dependencies:
|
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ~>
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '2.8'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.1'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.1'
|
30
41
|
description: Mystem is software provided by Yandex only for non-commercial
|
31
42
|
projects. With it you can detect the base forms of the words in a text and make a
|
32
43
|
simple morphological analysis of Russian words.
|
@@ -39,6 +50,7 @@ files:
|
|
39
50
|
- .gitignore
|
40
51
|
- .rspec
|
41
52
|
- .rvmrc
|
53
|
+
- .travis.yml
|
42
54
|
- Gemfile
|
43
55
|
- README.md
|
44
56
|
- Rakefile
|
@@ -53,26 +65,25 @@ files:
|
|
53
65
|
- yandex_mystem.gemspec
|
54
66
|
homepage: ''
|
55
67
|
licenses: []
|
68
|
+
metadata: {}
|
56
69
|
post_install_message:
|
57
70
|
rdoc_options: []
|
58
71
|
require_paths:
|
59
72
|
- lib
|
60
73
|
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
-
none: false
|
62
74
|
requirements:
|
63
|
-
- -
|
75
|
+
- - '>='
|
64
76
|
- !ruby/object:Gem::Version
|
65
77
|
version: '0'
|
66
78
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
-
none: false
|
68
79
|
requirements:
|
69
|
-
- -
|
80
|
+
- - '>='
|
70
81
|
- !ruby/object:Gem::Version
|
71
82
|
version: '0'
|
72
83
|
requirements: []
|
73
84
|
rubyforge_project:
|
74
|
-
rubygems_version:
|
85
|
+
rubygems_version: 2.0.3
|
75
86
|
signing_key:
|
76
|
-
specification_version:
|
87
|
+
specification_version: 4
|
77
88
|
summary: Yandex Mystem performs morphological analysis of Russian text
|
78
89
|
test_files: []
|