yandex_mystem 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cba5e52c1b005179967ad5a066786c5050342018
4
+ data.tar.gz: f9112c1e6ce51124d48279f8218fa5d522b2ded8
5
+ SHA512:
6
+ metadata.gz: 9337422e5f0293516d4b8c48a5adc3ae1d48f1682d403d700480a4f9f71ee5a71f68bec89525001b6dc05ce7e01193a2bc609c089e8e603fa2bff271740d2a20
7
+ data.tar.gz: 1c84c101387557db4f8760445168412b3f14053e678689c63e2237b21137b978dea4c23f4633f26e2cd33a07cfe8b061f90339764056072d59a1bc58dd3de0a8
data/.rvmrc CHANGED
@@ -1 +1 @@
1
- rvm use 1.9.3 --create
1
+ rvm use 2.0.0 --create
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ rvm:
2
+ - 1.9.3
3
+ - 2.0.0
data/README.md CHANGED
@@ -1,4 +1,7 @@
1
- # Yandex Mystem
1
+ # Yandex Mystem 0.1.0
2
+
3
+ [![Build Status](https://secure.travis-ci.org/dmitry/yandex_mystem.png?branch=master)](http://travis-ci.org/dmitry/yandex_mystem) [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/dmitry/yandex_mystem/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
4
+
2
5
 
3
6
  ## Introduction
4
7
 
@@ -27,4 +30,5 @@ This gem contains executables for there platforms:
27
30
 
28
31
  ## Usage
29
32
 
30
- YandexMystem::Base.stem 'О предложении в котором много слов.'
33
+ YandexMystem::Simple.stem 'О предложении в котором много слов.'
34
+ YandexMystem::Extended.stem 'нет сов'
@@ -1,3 +1,3 @@
1
1
  module YandexMystem
2
- VERSION = "0.0.2"
2
+ VERSION = '0.1.0'
3
3
  end
data/lib/yandex_mystem.rb CHANGED
@@ -3,35 +3,25 @@ require 'yandex_mystem/version'
3
3
 
4
4
  module YandexMystem
5
5
  class Base
6
- # TODO add -i
6
+ WORD_SCANNER_REGEXP = /^([^\{]+)\{(.+)\}$/.freeze
7
+
7
8
  def self.stem(text)
8
- exec = Array(command).tap do |c|
9
- c << '-e utf-8 -n'
10
- end.join(' ')
9
+ exec = [command, self::ARGUMENTS].join(' ')
11
10
 
12
- data = Open3.popen3(exec) do |stdin, stdout, stderr|
11
+ data = Open3.popen3(exec) do |stdin, stdout, _|
13
12
  stdin.write text
14
13
  stdin.close
15
- #stderr.read
16
14
  stdout.read
17
15
  end
18
16
 
19
- data = data.scan(/^([^\{]+)\{(.+)\}$/).map do |(word, words)|
20
- words = words.split('|').select do |w|
21
- !(w =~ /.+\?\?$/)
22
- end
23
-
24
- [word, words]
25
- end.flatten(1)
26
-
27
- Hash[*data]
17
+ parse(data)
28
18
  end
29
19
 
30
- private
31
-
32
20
  def self.command
33
- path = Pathname.new(__FILE__) + '../../app/'
34
- path + "mystem-#{command_postfix}"
21
+ @command ||= begin
22
+ path = Pathname.new(__FILE__) + '../../app/'
23
+ path + "mystem-#{command_postfix}"
24
+ end
35
25
  end
36
26
 
37
27
  def self.command_postfix
@@ -51,4 +41,56 @@ module YandexMystem
51
41
  end
52
42
  end
53
43
  end
44
+
45
+ class Simple < Base
46
+ ARGUMENTS = '-e utf-8 -n'
47
+
48
+ NOT_INCLUDE_REGEXP = /.+\?\?$/.freeze
49
+
50
+
51
+ def self.parse(data)
52
+ parsed = data.scan(WORD_SCANNER_REGEXP).map do |(word, words)|
53
+ words = words.split('|').select do |w|
54
+ !(w =~ NOT_INCLUDE_REGEXP)
55
+ end
56
+
57
+ [word, words]
58
+ end.flatten(1)
59
+
60
+ Hash[*parsed]
61
+ end
62
+ end
63
+
64
+ class Extended < Base
65
+ ARGUMENTS = '-e utf-8 -nifg'
66
+
67
+ REGEXP = /([^\|:]+):([0-9\.]+)=([A-Z]+)/
68
+
69
+ Word = Struct.new(:word, :frequency, :part)
70
+
71
+ def self.parse(data)
72
+ parsed = {}
73
+
74
+ data.scan(WORD_SCANNER_REGEXP).each do |(word, words)|
75
+ unless parsed.key?(word)
76
+ words = words.scan(REGEXP).map do |w|
77
+ to_word(w)
78
+ end
79
+
80
+ unless words.size.zero?
81
+ parsed[word] = words.sort_by(&:frequency).reverse
82
+ end
83
+ end
84
+ end
85
+
86
+ parsed
87
+ end
88
+
89
+ private
90
+
91
+ def self.to_word(w)
92
+ word, frequency, part = w
93
+ Word.new(word, frequency.to_f, part)
94
+ end
95
+ end
54
96
  end
@@ -2,14 +2,54 @@
2
2
  require 'spec_helper'
3
3
 
4
4
  describe YandexMystem do
5
- it "should stem words" do
6
- data = YandexMystem::Base.stem('мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements')
7
- data['мальчики'].should eq ['мальчик']
8
- data['мальчиков'].should eq ['мальчик', "мальчиков", "мальчиковый"]
9
- data['девочки'].should eq ['девочка']
10
- data['девочек'].should eq ['девочка']
11
- data['сов'].should eq ['сова']
12
- data['пошли'].should eq %w(пойти посылать)
13
- data['elements'].should eq []
5
+ context YandexMystem::Simple do
6
+ it "should stem words" do
7
+ data = YandexMystem::Simple.stem('мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements')
8
+ data['мальчики'].should eq %w(мальчик)
9
+ data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
10
+ data['девочки'].should eq %w(девочка)
11
+ data['девочек'].should eq %w(девочка)
12
+ data['сов'].should eq %w(сова)
13
+ data['пошли'].should eq %w(пойти посылать)
14
+ data['elements'].should eq []
15
+ end
14
16
  end
15
- end
17
+
18
+ context YandexMystem::Extended do
19
+ it 'latin words' do
20
+ words = YandexMystem::Extended.stem('elements')
21
+
22
+ words.size.should eq 0
23
+
24
+ words['elements'].should be_nil
25
+ end
26
+
27
+ it 'multiple definitions' do
28
+ words = YandexMystem::Extended.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
29
+
30
+ words.size.should eq 11
31
+
32
+ words['девочка'].first.word.should eq 'девочка'
33
+ words['девочка'].first.frequency.should eq 185.2
34
+ words['девочка'].first.part.should eq 'S'
35
+
36
+
37
+ words['мальчиков'].size.should eq 3
38
+ end
39
+
40
+ it 'sort by frequency' do
41
+ words = YandexMystem::Extended.stem('сосланный')
42
+
43
+ words['сосланный'].size.should eq 2
44
+
45
+ words['сосланный'].map(&:word).should eq %w(ссылать сосланный)
46
+ words['сосланный'].map(&:frequency).should eq [1.2, 0.5]
47
+ words['сосланный'].map(&:part).should eq %w(V S)
48
+ end
49
+
50
+ it 'set multiple times the same word, but lowercase and uppercase is different' do
51
+ YandexMystem::Extended.should_receive(:to_word).exactly(4).times.and_return(YandexMystem::Extended.const_get(:Word).new('в', 2, 'S'))
52
+ YandexMystem::Extended.stem('В в в в')
53
+ end
54
+ end
55
+ end
@@ -16,4 +16,5 @@ Gem::Specification.new do |gem|
16
16
  gem.version = YandexMystem::VERSION
17
17
 
18
18
  gem.add_development_dependency "rspec", '~> 2.8'
19
+ gem.add_development_dependency "rake", '~> 10.1'
19
20
  end
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yandex_mystem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
5
- prerelease:
4
+ version: 0.1.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Dmitry Polushkin
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-06-28 00:00:00.000000000 Z
11
+ date: 2013-08-04 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rspec
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ~>
20
18
  - !ruby/object:Gem::Version
@@ -22,11 +20,24 @@ dependencies:
22
20
  type: :development
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ~>
28
25
  - !ruby/object:Gem::Version
29
26
  version: '2.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.1'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '10.1'
30
41
  description: Mystem is a software that provided by the Yandex only for non-commercial
31
42
  project. With use of it you can detect base forms of the words in a text, make a
32
43
  simple morphological analysis of russian words.
@@ -39,6 +50,7 @@ files:
39
50
  - .gitignore
40
51
  - .rspec
41
52
  - .rvmrc
53
+ - .travis.yml
42
54
  - Gemfile
43
55
  - README.md
44
56
  - Rakefile
@@ -53,26 +65,25 @@ files:
53
65
  - yandex_mystem.gemspec
54
66
  homepage: ''
55
67
  licenses: []
68
+ metadata: {}
56
69
  post_install_message:
57
70
  rdoc_options: []
58
71
  require_paths:
59
72
  - lib
60
73
  required_ruby_version: !ruby/object:Gem::Requirement
61
- none: false
62
74
  requirements:
63
- - - ! '>='
75
+ - - '>='
64
76
  - !ruby/object:Gem::Version
65
77
  version: '0'
66
78
  required_rubygems_version: !ruby/object:Gem::Requirement
67
- none: false
68
79
  requirements:
69
- - - ! '>='
80
+ - - '>='
70
81
  - !ruby/object:Gem::Version
71
82
  version: '0'
72
83
  requirements: []
73
84
  rubyforge_project:
74
- rubygems_version: 1.8.24
85
+ rubygems_version: 2.0.3
75
86
  signing_key:
76
- specification_version: 3
87
+ specification_version: 4
77
88
  summary: Yandex Mystem makes morphological analysis of a russian text
78
89
  test_files: []