yandex_mystem 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cba5e52c1b005179967ad5a066786c5050342018
4
+ data.tar.gz: f9112c1e6ce51124d48279f8218fa5d522b2ded8
5
+ SHA512:
6
+ metadata.gz: 9337422e5f0293516d4b8c48a5adc3ae1d48f1682d403d700480a4f9f71ee5a71f68bec89525001b6dc05ce7e01193a2bc609c089e8e603fa2bff271740d2a20
7
+ data.tar.gz: 1c84c101387557db4f8760445168412b3f14053e678689c63e2237b21137b978dea4c23f4633f26e2cd33a07cfe8b061f90339764056072d59a1bc58dd3de0a8
data/.rvmrc CHANGED
@@ -1 +1 @@
1
- rvm use 1.9.3 --create
1
+ rvm use 2.0.0 --create
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ rvm:
2
+ - 1.9.3
3
+ - 2.0.0
data/README.md CHANGED
@@ -1,4 +1,7 @@
1
- # Yandex Mystem
1
+ # Yandex Mystem 0.1.0
2
+
3
+ [![Build Status](https://secure.travis-ci.org/dmitry/yandex_mystem.png?branch=master)](http://travis-ci.org/dmitry/yandex_mystem) [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/dmitry/yandex_mystem/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
4
+
2
5
 
3
6
  ## Introduction
4
7
 
@@ -27,4 +30,5 @@ This gem contains executables for these platforms:
27
30
 
28
31
  ## Usage
29
32
 
30
- YandexMystem::Base.stem 'О предложении в котором много слов.'
33
+ YandexMystem::Simple.stem 'О предложении в котором много слов.'
34
+ YandexMystem::Extended.stem 'нет сов'
@@ -1,3 +1,3 @@
1
1
  module YandexMystem
2
- VERSION = "0.0.2"
2
+ VERSION = '0.1.0'
3
3
  end
data/lib/yandex_mystem.rb CHANGED
@@ -3,35 +3,25 @@ require 'yandex_mystem/version'
3
3
 
4
4
  module YandexMystem
5
5
  class Base
6
- # TODO add -i
6
+ WORD_SCANNER_REGEXP = /^([^\{]+)\{(.+)\}$/.freeze
7
+
7
8
  def self.stem(text)
8
- exec = Array(command).tap do |c|
9
- c << '-e utf-8 -n'
10
- end.join(' ')
9
+ exec = [command, self::ARGUMENTS].join(' ')
11
10
 
12
- data = Open3.popen3(exec) do |stdin, stdout, stderr|
11
+ data = Open3.popen3(exec) do |stdin, stdout, _|
13
12
  stdin.write text
14
13
  stdin.close
15
- #stderr.read
16
14
  stdout.read
17
15
  end
18
16
 
19
- data = data.scan(/^([^\{]+)\{(.+)\}$/).map do |(word, words)|
20
- words = words.split('|').select do |w|
21
- !(w =~ /.+\?\?$/)
22
- end
23
-
24
- [word, words]
25
- end.flatten(1)
26
-
27
- Hash[*data]
17
+ parse(data)
28
18
  end
29
19
 
30
- private
31
-
32
20
  def self.command
33
- path = Pathname.new(__FILE__) + '../../app/'
34
- path + "mystem-#{command_postfix}"
21
+ @command ||= begin
22
+ path = Pathname.new(__FILE__) + '../../app/'
23
+ path + "mystem-#{command_postfix}"
24
+ end
35
25
  end
36
26
 
37
27
  def self.command_postfix
@@ -51,4 +41,56 @@ module YandexMystem
51
41
  end
52
42
  end
53
43
  end
44
+
45
+ class Simple < Base
46
+ ARGUMENTS = '-e utf-8 -n'
47
+
48
+ NOT_INCLUDE_REGEXP = /.+\?\?$/.freeze
49
+
50
+
51
+ def self.parse(data)
52
+ parsed = data.scan(WORD_SCANNER_REGEXP).map do |(word, words)|
53
+ words = words.split('|').select do |w|
54
+ !(w =~ NOT_INCLUDE_REGEXP)
55
+ end
56
+
57
+ [word, words]
58
+ end.flatten(1)
59
+
60
+ Hash[*parsed]
61
+ end
62
+ end
63
+
64
+ class Extended < Base
65
+ ARGUMENTS = '-e utf-8 -nifg'
66
+
67
+ REGEXP = /([^\|:]+):([0-9\.]+)=([A-Z]+)/
68
+
69
+ Word = Struct.new(:word, :frequency, :part)
70
+
71
+ def self.parse(data)
72
+ parsed = {}
73
+
74
+ data.scan(WORD_SCANNER_REGEXP).each do |(word, words)|
75
+ unless parsed.key?(word)
76
+ words = words.scan(REGEXP).map do |w|
77
+ to_word(w)
78
+ end
79
+
80
+ unless words.size.zero?
81
+ parsed[word] = words.sort_by(&:frequency).reverse
82
+ end
83
+ end
84
+ end
85
+
86
+ parsed
87
+ end
88
+
89
+ private
90
+
91
+ def self.to_word(w)
92
+ word, frequency, part = w
93
+ Word.new(word, frequency.to_f, part)
94
+ end
95
+ end
54
96
  end
@@ -2,14 +2,54 @@
2
2
  require 'spec_helper'
3
3
 
4
4
  describe YandexMystem do
5
- it "should stem words" do
6
- data = YandexMystem::Base.stem('мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements')
7
- data['мальчики'].should eq ['мальчик']
8
- data['мальчиков'].should eq ['мальчик', "мальчиков", "мальчиковый"]
9
- data['девочки'].should eq ['девочка']
10
- data['девочек'].should eq ['девочка']
11
- data['сов'].should eq ['сова']
12
- data['пошли'].should eq %w(пойти посылать)
13
- data['elements'].should eq []
5
+ context YandexMystem::Simple do
6
+ it "should stem words" do
7
+ data = YandexMystem::Simple.stem('мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements')
8
+ data['мальчики'].should eq %w(мальчик)
9
+ data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
10
+ data['девочки'].should eq %w(девочка)
11
+ data['девочек'].should eq %w(девочка)
12
+ data['сов'].should eq %w(сова)
13
+ data['пошли'].should eq %w(пойти посылать)
14
+ data['elements'].should eq []
15
+ end
14
16
  end
15
- end
17
+
18
+ context YandexMystem::Extended do
19
+ it 'latin words' do
20
+ words = YandexMystem::Extended.stem('elements')
21
+
22
+ words.size.should eq 0
23
+
24
+ words['elements'].should be_nil
25
+ end
26
+
27
+ it 'multiple definitions' do
28
+ words = YandexMystem::Extended.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
29
+
30
+ words.size.should eq 11
31
+
32
+ words['девочка'].first.word.should eq 'девочка'
33
+ words['девочка'].first.frequency.should eq 185.2
34
+ words['девочка'].first.part.should eq 'S'
35
+
36
+
37
+ words['мальчиков'].size.should eq 3
38
+ end
39
+
40
+ it 'sort by frequency' do
41
+ words = YandexMystem::Extended.stem('сосланный')
42
+
43
+ words['сосланный'].size.should eq 2
44
+
45
+ words['сосланный'].map(&:word).should eq %w(ссылать сосланный)
46
+ words['сосланный'].map(&:frequency).should eq [1.2, 0.5]
47
+ words['сосланный'].map(&:part).should eq %w(V S)
48
+ end
49
+
50
+ it 'set multiple times the same word, but lowercase and uppercase is different' do
51
+ YandexMystem::Extended.should_receive(:to_word).exactly(4).times.and_return(YandexMystem::Extended.const_get(:Word).new('в', 2, 'S'))
52
+ YandexMystem::Extended.stem('В в в в')
53
+ end
54
+ end
55
+ end
@@ -16,4 +16,5 @@ Gem::Specification.new do |gem|
16
16
  gem.version = YandexMystem::VERSION
17
17
 
18
18
  gem.add_development_dependency "rspec", '~> 2.8'
19
+ gem.add_development_dependency "rake", '~> 10.1'
19
20
  end
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yandex_mystem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
5
- prerelease:
4
+ version: 0.1.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Dmitry Polushkin
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-06-28 00:00:00.000000000 Z
11
+ date: 2013-08-04 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rspec
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ~>
20
18
  - !ruby/object:Gem::Version
@@ -22,11 +20,24 @@ dependencies:
22
20
  type: :development
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ~>
28
25
  - !ruby/object:Gem::Version
29
26
  version: '2.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.1'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '10.1'
30
41
  description: Mystem is a software that provided by the Yandex only for non-commercial
31
42
  project. With use of it you can detect base forms of the words in a text, make a
32
43
  simple morphological analysis of russian words.
@@ -39,6 +50,7 @@ files:
39
50
  - .gitignore
40
51
  - .rspec
41
52
  - .rvmrc
53
+ - .travis.yml
42
54
  - Gemfile
43
55
  - README.md
44
56
  - Rakefile
@@ -53,26 +65,25 @@ files:
53
65
  - yandex_mystem.gemspec
54
66
  homepage: ''
55
67
  licenses: []
68
+ metadata: {}
56
69
  post_install_message:
57
70
  rdoc_options: []
58
71
  require_paths:
59
72
  - lib
60
73
  required_ruby_version: !ruby/object:Gem::Requirement
61
- none: false
62
74
  requirements:
63
- - - ! '>='
75
+ - - '>='
64
76
  - !ruby/object:Gem::Version
65
77
  version: '0'
66
78
  required_rubygems_version: !ruby/object:Gem::Requirement
67
- none: false
68
79
  requirements:
69
- - - ! '>='
80
+ - - '>='
70
81
  - !ruby/object:Gem::Version
71
82
  version: '0'
72
83
  requirements: []
73
84
  rubyforge_project:
74
- rubygems_version: 1.8.24
85
+ rubygems_version: 2.0.3
75
86
  signing_key:
76
- specification_version: 3
87
+ specification_version: 4
77
88
  summary: Yandex Mystem makes morphological analysis of a russian text
78
89
  test_files: []