yandex_mystem 0.1.1 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a6c7e0e979e518cf69b18065ac82fef6341fa401
4
- data.tar.gz: d21da1203a6bff374411f82ed2e8d044cf88e821
3
+ metadata.gz: d7c1c4fcbbbafd601e0457db5366a8d82639955d
4
+ data.tar.gz: 3da18f1e83a23747a51786fcda68b5ccd109dfb2
5
5
  SHA512:
6
- metadata.gz: 79d31c75be6137c8d4fa5cc451fd4ed54f1fe76fb001460b10d4d33d8c1530e1a154f20979de116f51a5cf940a594d1829682bca9594bd61bfa9a624397a35ee
7
- data.tar.gz: edb8e0a6d1cec46d4f5a5db9412a57ce9698808685e1bea2990c2a44e53497f30442bb8fd82dee8f73e71cae497cf12fab66515d4eab3b3dd099404fc22da88b
6
+ metadata.gz: fdca784ffa0621fc345ffa080e00131c43db0ac86f14012082271ed378a1cfa32ff1145acfecb77ef2d336e8f6a8ed4e976375b81ed3fbedbfd94aaf7213ba97
7
+ data.tar.gz: 6fa8de5e45b069137e46165d6da1cd7d0209b297e85de996a30340d1171cf8d85eb1194787f0fa40bae12fbb4ccd92ecef0601462cfdc3e2e6664dbf9ff8f234
data/README.md CHANGED
@@ -3,6 +3,12 @@
3
3
  [![Build Status](https://secure.travis-ci.org/dmitry/yandex_mystem.png?branch=master)](http://travis-ci.org/dmitry/yandex_mystem) [![Gem Version](https://badge.fury.io/rb/yandex_mystem.png)](http://badge.fury.io/rb/yandex_mystem) [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/dmitry/yandex_mystem/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
4
4
 
5
5
 
6
+ ## Version 3.0 is not compatible with previous versions
7
+
8
+ Mystem 3.0 supports the JSON output format, so this gem now uses that option.
9
+ This gem (yandex_mystem) now returns information in JSON-like ruby native format (array of hashes).
10
+ More info about Mystem changes: http://api.yandex.ru/mystem/downloads/
11
+
6
12
  ## Introduction
7
13
 
8
14
  Mystem is software provided by Yandex for non-commercial projects only. With it you can detect the base forms of words in a text and perform simple morphological analysis of Russian words.
@@ -13,7 +19,7 @@ Mystem is a software that provided by the Yandex only for non-commercial project
13
19
 
14
20
  ## License
15
21
 
16
- First of all, read license on http://company.yandex.ru/technologies/mystem/
22
+ First of all, read license on http://api.yandex.ru/mystem/
17
23
 
18
24
  `Mystem` available only for non-commercial usage.
19
25
 
@@ -21,14 +27,14 @@ First of all, read license on http://company.yandex.ru/technologies/mystem/
21
27
 
22
28
  This gem contains executables for these platforms:
23
29
 
24
- * Windows
25
- * Linux 2.6 32-bit
26
- * Linux 2.6 64-bit
27
- * Mac OS X 10.5
30
+ * Windows 7 32-bit
31
+ * Linux 3.5 32-bit
32
+ * Linux 3.1 64-bit
33
+ * Freebsd 9.0 64-bit
28
34
 
29
- ...of six, FreeBSD not in the gem. If you need it, add pull request or issue.
35
+ Note: Mystem 3.0 does not support Mac OS X, sorry.
30
36
 
31
37
  ## Usage
32
38
 
33
39
  YandexMystem::Simple.stem 'О предложении в котором много слов.'
34
- YandexMystem::Extended.stem 'нет сов'
40
+ YandexMystem::Raw.stem 'нет сов'
data/bin/mystem-bsd ADDED
Binary file
Binary file
Binary file
Binary file
@@ -1,3 +1,3 @@
1
1
  module YandexMystem
2
- VERSION = '0.1.1'
2
+ VERSION = '3.0.0'
3
3
  end
data/lib/yandex_mystem.rb CHANGED
@@ -1,10 +1,12 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'open3'
2
4
  require 'yandex_mystem/version'
3
5
  require 'pathname'
6
+ require 'json'
4
7
 
5
8
  module YandexMystem
6
- class Base
7
- WORD_SCANNER_REGEXP = /^([^\{]+)\{(.+)\}$/.freeze
9
+ class Base
8
10
 
9
11
  def self.stem(text)
10
12
  exec = [command, self::ARGUMENTS].join(' ')
@@ -20,7 +22,7 @@ module YandexMystem
20
22
 
21
23
  def self.command
22
24
  @command ||= begin
23
- path = Pathname.new(__FILE__) + '../../app/'
25
+ path = Pathname.new(__FILE__) + '../../bin/'
24
26
  path + "mystem-#{command_postfix}"
25
27
  end
26
28
  end
@@ -34,9 +36,9 @@ module YandexMystem
34
36
  when /64.+linux$/
35
37
  'linux-64'
36
38
  when /darwin/
37
- 'mac'
39
+ raise 'Mystem 3.0 does not support Max OS X.'
38
40
  when /freebsd/
39
- raise 'Create an issue or add pull request on a github.'
41
+ 'bsd'
40
42
  else
41
43
  raise 'Unknown OS'
42
44
  end
@@ -44,54 +46,19 @@ module YandexMystem
44
46
  end
45
47
 
46
48
  class Simple < Base
47
- ARGUMENTS = '-e utf-8 -n'
48
-
49
- NOT_INCLUDE_REGEXP = /.+\?\?$/.freeze
49
+ ARGUMENTS = '-e utf-8 -n --format json'
50
50
 
51
51
 
52
52
  def self.parse(data)
53
- parsed = data.scan(WORD_SCANNER_REGEXP).map do |(word, words)|
54
- words = words.split('|').select do |w|
55
- !(w =~ NOT_INCLUDE_REGEXP)
56
- end
57
-
58
- [word, words]
59
- end.flatten(1)
60
-
61
- Hash[*parsed]
53
+ Hash[ JSON.parse('[' + data.split("\n").join(",") + ']', :symbolize_names => true).inject([]){|s, h| s + [[ h[:text], h[:analysis].map{|a| a[:lex]} ]]} ]
62
54
  end
63
55
  end
64
56
 
65
- class Extended < Base
66
- ARGUMENTS = '-e utf-8 -nifg'
67
-
68
- REGEXP = /([^\|:]+):([0-9\.]+)=([A-Z]+)/
69
-
70
- Word = Struct.new(:word, :frequency, :part)
57
+ class Raw < Base
58
+ ARGUMENTS = '-e utf-8 -ig -n --weight --format json --eng-gr'
71
59
 
72
60
  def self.parse(data)
73
- parsed = {}
74
-
75
- data.scan(WORD_SCANNER_REGEXP).each do |(word, words)|
76
- unless parsed.key?(word)
77
- words = words.scan(REGEXP).map do |w|
78
- to_word(w)
79
- end
80
-
81
- unless words.size.zero?
82
- parsed[word] = words.sort_by(&:frequency).reverse
83
- end
84
- end
85
- end
86
-
87
- parsed
88
- end
89
-
90
- private
91
-
92
- def self.to_word(w)
93
- word, frequency, part = w
94
- Word.new(word, frequency.to_f, part)
95
- end
61
+ JSON.parse('[' + data.split("\n").join(",") + ']', :symbolize_names => true)
62
+ end
96
63
  end
97
64
  end
@@ -4,7 +4,28 @@ require 'spec_helper'
4
4
  describe YandexMystem do
5
5
  context YandexMystem::Simple do
6
6
  it "should stem words" do
7
- data = YandexMystem::Simple.stem('мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements')
7
+ data = YandexMystem::Simple.stem("мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements")
8
+ data['мальчики'].should eq %w(мальчик)
9
+ data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
10
+ data['девочки'].should eq %w(девочка)
11
+ data['девочек'].should eq %w(девочка)
12
+ data['сов'].should eq %w(сова)
13
+ data['пошли'].should eq %w(пойти посылать)
14
+ data['elements'].should eq []
15
+ end
16
+
17
+ it "should stem words in few lines" do
18
+ data = YandexMystem::Simple.stem(%[
19
+
20
+
21
+ мальчики
22
+ мальчиков
23
+ девочки девочек компьютеров компьютере сов пошли
24
+ elements
25
+
26
+
27
+
28
+ ])
8
29
  data['мальчики'].should eq %w(мальчик)
9
30
  data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
10
31
  data['девочки'].should eq %w(девочка)
@@ -15,41 +36,32 @@ describe YandexMystem do
15
36
  end
16
37
  end
17
38
 
18
- context YandexMystem::Extended do
39
+ context YandexMystem::Raw do
19
40
  it 'latin words' do
20
- words = YandexMystem::Extended.stem('elements')
21
-
22
- words.size.should eq 0
41
+ response = YandexMystem::Raw.stem('Elements')
23
42
 
24
- words['elements'].should be_nil
43
+ response.size.should eq 1
44
+ response[0][:analysis].should eq []
45
+ response[0][:text].should eq 'Elements'
25
46
  end
26
47
 
27
48
  it 'multiple definitions' do
28
- words = YandexMystem::Extended.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
49
+ response = YandexMystem::Raw.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
29
50
 
30
- words.size.should eq 11
51
+ response.size.should eq 11
31
52
 
32
- words['девочка'].first.word.should eq 'девочка'
33
- words['девочка'].first.frequency.should eq 185.2
34
- words['девочка'].first.part.should eq 'S'
53
+ response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:lex].should eq 'девочка'
54
+ response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:wt].should eq 1
55
+ response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:gr].should eq 'S,f,anim=nom,sg'
35
56
 
36
-
37
- words['мальчиков'].size.should eq 3
57
+ response.find_all{|h| h[:text] == 'мальчиков'}.first[:analysis].size.should eq 3
38
58
  end
39
59
 
40
- it 'sort by frequency' do
41
- words = YandexMystem::Extended.stem('сосланный')
42
-
43
- words['сосланный'].size.should eq 2
44
-
45
- words['сосланный'].map(&:word).should eq %w(ссылать сосланный)
46
- words['сосланный'].map(&:frequency).should eq [1.2, 0.5]
47
- words['сосланный'].map(&:part).should eq %w(V S)
60
+ it 'get geo name' do
61
+ response = YandexMystem::Raw.stem('Москва')
62
+ response.find_all{|h| h[:text] == 'Москва'}.first[:analysis].first[:gr].should include 'geo'
48
63
  end
49
64
 
50
- it 'set multiple times the same word, but lowercase and uppercase is different' do
51
- YandexMystem::Extended.should_receive(:to_word).exactly(4).times.and_return(YandexMystem::Extended.const_get(:Word).new('в', 2, 'S'))
52
- YandexMystem::Extended.stem('В в в в')
53
- end
65
+
54
66
  end
55
67
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yandex_mystem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Polushkin
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-28 00:00:00.000000000 Z
11
+ date: 2014-08-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -43,21 +43,24 @@ description: Mystem is a software that provided by the Yandex only for non-comme
43
43
  simple morphological analysis of russian words.
44
44
  email:
45
45
  - dmitry.polushkin@gmail.com
46
- executables: []
46
+ executables:
47
+ - mystem-bsd
48
+ - mystem-linux-32
49
+ - mystem-linux-64
50
+ - mystem-win.exe
47
51
  extensions: []
48
52
  extra_rdoc_files: []
49
53
  files:
50
54
  - ".gitignore"
51
55
  - ".rspec"
52
- - ".rvmrc"
53
56
  - ".travis.yml"
54
57
  - Gemfile
55
58
  - README.md
56
59
  - Rakefile
57
- - app/mystem-linux-32
58
- - app/mystem-linux-64
59
- - app/mystem-mac
60
- - app/mystem-win.exe
60
+ - bin/mystem-bsd
61
+ - bin/mystem-linux-32
62
+ - bin/mystem-linux-64
63
+ - bin/mystem-win.exe
61
64
  - lib/yandex_mystem.rb
62
65
  - lib/yandex_mystem/version.rb
63
66
  - spec/spec_helper.rb
data/.rvmrc DELETED
@@ -1 +0,0 @@
1
- rvm use 2.0.0 --create
data/app/mystem-linux-32 DELETED
Binary file
data/app/mystem-linux-64 DELETED
Binary file
data/app/mystem-mac DELETED
Binary file
data/app/mystem-win.exe DELETED
Binary file