yandex_mystem 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rvmrc +1 -1
- data/.travis.yml +3 -0
- data/README.md +6 -2
- data/lib/yandex_mystem/version.rb +1 -1
- data/lib/yandex_mystem.rb +61 -19
- data/spec/yandex_mystem_spec.rb +50 -10
- data/yandex_mystem.gemspec +1 -0
- metadata +22 -11
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cba5e52c1b005179967ad5a066786c5050342018
|
4
|
+
data.tar.gz: f9112c1e6ce51124d48279f8218fa5d522b2ded8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9337422e5f0293516d4b8c48a5adc3ae1d48f1682d403d700480a4f9f71ee5a71f68bec89525001b6dc05ce7e01193a2bc609c089e8e603fa2bff271740d2a20
|
7
|
+
data.tar.gz: 1c84c101387557db4f8760445168412b3f14053e678689c63e2237b21137b978dea4c23f4633f26e2cd33a07cfe8b061f90339764056072d59a1bc58dd3de0a8
|
data/.rvmrc
CHANGED
@@ -1 +1 @@
|
|
1
|
-
rvm use
|
1
|
+
rvm use 2.0.0 --create
|
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
-
# Yandex Mystem
|
1
|
+
# Yandex Mystem 0.1.0
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/dmitry/yandex_mystem.png?branch=master)](http://travis-ci.org/dmitry/yandex_mystem) [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/dmitry/yandex_mystem/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
|
4
|
+
|
2
5
|
|
3
6
|
## Introduction
|
4
7
|
|
@@ -27,4 +30,5 @@ This gem contains executables for these platforms:
|
|
27
30
|
|
28
31
|
## Usage
|
29
32
|
|
30
|
-
YandexMystem::
|
33
|
+
YandexMystem::Simple.stem 'О предложении в котором много слов.'
|
34
|
+
YandexMystem::Extended.stem 'нет сов'
|
data/lib/yandex_mystem.rb
CHANGED
@@ -3,35 +3,25 @@ require 'yandex_mystem/version'
|
|
3
3
|
|
4
4
|
module YandexMystem
|
5
5
|
class Base
|
6
|
-
|
6
|
+
WORD_SCANNER_REGEXP = /^([^\{]+)\{(.+)\}$/.freeze
|
7
|
+
|
7
8
|
def self.stem(text)
|
8
|
-
exec =
|
9
|
-
c << '-e utf-8 -n'
|
10
|
-
end.join(' ')
|
9
|
+
exec = [command, self::ARGUMENTS].join(' ')
|
11
10
|
|
12
|
-
data = Open3.popen3(exec) do |stdin, stdout,
|
11
|
+
data = Open3.popen3(exec) do |stdin, stdout, _|
|
13
12
|
stdin.write text
|
14
13
|
stdin.close
|
15
|
-
#stderr.read
|
16
14
|
stdout.read
|
17
15
|
end
|
18
16
|
|
19
|
-
data
|
20
|
-
words = words.split('|').select do |w|
|
21
|
-
!(w =~ /.+\?\?$/)
|
22
|
-
end
|
23
|
-
|
24
|
-
[word, words]
|
25
|
-
end.flatten(1)
|
26
|
-
|
27
|
-
Hash[*data]
|
17
|
+
parse(data)
|
28
18
|
end
|
29
19
|
|
30
|
-
private
|
31
|
-
|
32
20
|
def self.command
|
33
|
-
|
34
|
-
|
21
|
+
@command ||= begin
|
22
|
+
path = Pathname.new(__FILE__) + '../../app/'
|
23
|
+
path + "mystem-#{command_postfix}"
|
24
|
+
end
|
35
25
|
end
|
36
26
|
|
37
27
|
def self.command_postfix
|
@@ -51,4 +41,56 @@ module YandexMystem
|
|
51
41
|
end
|
52
42
|
end
|
53
43
|
end
|
44
|
+
|
45
|
+
class Simple < Base
|
46
|
+
ARGUMENTS = '-e utf-8 -n'
|
47
|
+
|
48
|
+
NOT_INCLUDE_REGEXP = /.+\?\?$/.freeze
|
49
|
+
|
50
|
+
|
51
|
+
def self.parse(data)
|
52
|
+
parsed = data.scan(WORD_SCANNER_REGEXP).map do |(word, words)|
|
53
|
+
words = words.split('|').select do |w|
|
54
|
+
!(w =~ NOT_INCLUDE_REGEXP)
|
55
|
+
end
|
56
|
+
|
57
|
+
[word, words]
|
58
|
+
end.flatten(1)
|
59
|
+
|
60
|
+
Hash[*parsed]
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
class Extended < Base
|
65
|
+
ARGUMENTS = '-e utf-8 -nifg'
|
66
|
+
|
67
|
+
REGEXP = /([^\|:]+):([0-9\.]+)=([A-Z]+)/
|
68
|
+
|
69
|
+
Word = Struct.new(:word, :frequency, :part)
|
70
|
+
|
71
|
+
def self.parse(data)
|
72
|
+
parsed = {}
|
73
|
+
|
74
|
+
data.scan(WORD_SCANNER_REGEXP).each do |(word, words)|
|
75
|
+
unless parsed.key?(word)
|
76
|
+
words = words.scan(REGEXP).map do |w|
|
77
|
+
to_word(w)
|
78
|
+
end
|
79
|
+
|
80
|
+
unless words.size.zero?
|
81
|
+
parsed[word] = words.sort_by(&:frequency).reverse
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
parsed
|
87
|
+
end
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
def self.to_word(w)
|
92
|
+
word, frequency, part = w
|
93
|
+
Word.new(word, frequency.to_f, part)
|
94
|
+
end
|
95
|
+
end
|
54
96
|
end
|
data/spec/yandex_mystem_spec.rb
CHANGED
@@ -2,14 +2,54 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe YandexMystem do
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
5
|
+
context YandexMystem::Simple do
|
6
|
+
it "should stem words" do
|
7
|
+
data = YandexMystem::Simple.stem('мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements')
|
8
|
+
data['мальчики'].should eq %w(мальчик)
|
9
|
+
data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
|
10
|
+
data['девочки'].should eq %w(девочка)
|
11
|
+
data['девочек'].should eq %w(девочка)
|
12
|
+
data['сов'].should eq %w(сова)
|
13
|
+
data['пошли'].should eq %w(пойти посылать)
|
14
|
+
data['elements'].should eq []
|
15
|
+
end
|
14
16
|
end
|
15
|
-
|
17
|
+
|
18
|
+
context YandexMystem::Extended do
|
19
|
+
it 'latin words' do
|
20
|
+
words = YandexMystem::Extended.stem('elements')
|
21
|
+
|
22
|
+
words.size.should eq 0
|
23
|
+
|
24
|
+
words['elements'].should be_nil
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'multiple definitions' do
|
28
|
+
words = YandexMystem::Extended.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
|
29
|
+
|
30
|
+
words.size.should eq 11
|
31
|
+
|
32
|
+
words['девочка'].first.word.should eq 'девочка'
|
33
|
+
words['девочка'].first.frequency.should eq 185.2
|
34
|
+
words['девочка'].first.part.should eq 'S'
|
35
|
+
|
36
|
+
|
37
|
+
words['мальчиков'].size.should eq 3
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'sort by frequency' do
|
41
|
+
words = YandexMystem::Extended.stem('сосланный')
|
42
|
+
|
43
|
+
words['сосланный'].size.should eq 2
|
44
|
+
|
45
|
+
words['сосланный'].map(&:word).should eq %w(ссылать сосланный)
|
46
|
+
words['сосланный'].map(&:frequency).should eq [1.2, 0.5]
|
47
|
+
words['сосланный'].map(&:part).should eq %w(V S)
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'set multiple times the same word, but lowercase and uppercase is different' do
|
51
|
+
YandexMystem::Extended.should_receive(:to_word).exactly(4).times.and_return(YandexMystem::Extended.const_get(:Word).new('в', 2, 'S'))
|
52
|
+
YandexMystem::Extended.stem('В в в в')
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/yandex_mystem.gemspec
CHANGED
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yandex_mystem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Dmitry Polushkin
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2013-08-04 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rspec
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ~>
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,11 +20,24 @@ dependencies:
|
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ~>
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '2.8'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.1'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.1'
|
30
41
|
description: Mystem is software that Yandex provides only for non-commercial
|
31
42
|
projects. With it you can detect the base forms of the words in a text and perform
|
32
43
|
simple morphological analysis of Russian words.
|
@@ -39,6 +50,7 @@ files:
|
|
39
50
|
- .gitignore
|
40
51
|
- .rspec
|
41
52
|
- .rvmrc
|
53
|
+
- .travis.yml
|
42
54
|
- Gemfile
|
43
55
|
- README.md
|
44
56
|
- Rakefile
|
@@ -53,26 +65,25 @@ files:
|
|
53
65
|
- yandex_mystem.gemspec
|
54
66
|
homepage: ''
|
55
67
|
licenses: []
|
68
|
+
metadata: {}
|
56
69
|
post_install_message:
|
57
70
|
rdoc_options: []
|
58
71
|
require_paths:
|
59
72
|
- lib
|
60
73
|
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
-
none: false
|
62
74
|
requirements:
|
63
|
-
- -
|
75
|
+
- - '>='
|
64
76
|
- !ruby/object:Gem::Version
|
65
77
|
version: '0'
|
66
78
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
-
none: false
|
68
79
|
requirements:
|
69
|
-
- -
|
80
|
+
- - '>='
|
70
81
|
- !ruby/object:Gem::Version
|
71
82
|
version: '0'
|
72
83
|
requirements: []
|
73
84
|
rubyforge_project:
|
74
|
-
rubygems_version:
|
85
|
+
rubygems_version: 2.0.3
|
75
86
|
signing_key:
|
76
|
-
specification_version:
|
87
|
+
specification_version: 4
|
77
88
|
summary: Yandex Mystem performs morphological analysis of Russian text
|
78
89
|
test_files: []
|