yandex_mystem 0.1.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a6c7e0e979e518cf69b18065ac82fef6341fa401
4
- data.tar.gz: d21da1203a6bff374411f82ed2e8d044cf88e821
3
+ metadata.gz: d7c1c4fcbbbafd601e0457db5366a8d82639955d
4
+ data.tar.gz: 3da18f1e83a23747a51786fcda68b5ccd109dfb2
5
5
  SHA512:
6
- metadata.gz: 79d31c75be6137c8d4fa5cc451fd4ed54f1fe76fb001460b10d4d33d8c1530e1a154f20979de116f51a5cf940a594d1829682bca9594bd61bfa9a624397a35ee
7
- data.tar.gz: edb8e0a6d1cec46d4f5a5db9412a57ce9698808685e1bea2990c2a44e53497f30442bb8fd82dee8f73e71cae497cf12fab66515d4eab3b3dd099404fc22da88b
6
+ metadata.gz: fdca784ffa0621fc345ffa080e00131c43db0ac86f14012082271ed378a1cfa32ff1145acfecb77ef2d336e8f6a8ed4e976375b81ed3fbedbfd94aaf7213ba97
7
+ data.tar.gz: 6fa8de5e45b069137e46165d6da1cd7d0209b297e85de996a30340d1171cf8d85eb1194787f0fa40bae12fbb4ccd92ecef0601462cfdc3e2e6664dbf9ff8f234
data/README.md CHANGED
@@ -3,6 +3,12 @@
3
3
  [![Build Status](https://secure.travis-ci.org/dmitry/yandex_mystem.png?branch=master)](http://travis-ci.org/dmitry/yandex_mystem) [![Gem Version](https://badge.fury.io/rb/yandex_mystem.png)](http://badge.fury.io/rb/yandex_mystem) [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/dmitry/yandex_mystem/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
4
4
 
5
5
 
6
+ ## Version 3.0 is not compatible with previous versions
7
+
8
+ Mystem 3.0 supports the JSON format, so we should use this option.
9
+ This gem (yandex_mystem) now returns information in a JSON-like native Ruby format (an array of hashes).
10
+ More info about Mystem changes: http://api.yandex.ru/mystem/downloads/
11
+
6
12
  ## Introduction
7
13
 
8
14
  Mystem is software provided by Yandex for non-commercial projects only. With it you can detect the base forms of the words in a text and perform a simple morphological analysis of Russian words.
@@ -13,7 +19,7 @@ Mystem is a software that provided by the Yandex only for non-commercial project
13
19
 
14
20
  ## License
15
21
 
16
- First of all, read license on http://company.yandex.ru/technologies/mystem/
22
+ First of all, read the license at http://api.yandex.ru/mystem/
17
23
 
18
24
  `Mystem` is available only for non-commercial usage.
19
25
 
@@ -21,14 +27,14 @@ First of all, read license on http://company.yandex.ru/technologies/mystem/
21
27
 
22
28
  This gem contains executables for these platforms:
23
29
 
24
- * Windows
25
- * Linux 2.6 32-bit
26
- * Linux 2.6 64-bit
27
- * Mac OS X 10.5
30
+ * Windows 7 32-bit
31
+ * Linux 3.5 32-bit
32
+ * Linux 3.1 64-bit
33
+ * FreeBSD 9.0 64-bit
28
34
 
29
- ...of six, FreeBSD not in the gem. If you need it, add pull request or issue.
35
+ Note: Mystem 3.0 does not support Mac OS X, sorry.
30
36
 
31
37
  ## Usage
32
38
 
33
39
  YandexMystem::Simple.stem 'О предложении в котором много слов.'
34
- YandexMystem::Extended.stem 'нет сов'
40
+ YandexMystem::Raw.stem 'нет сов'
data/bin/mystem-bsd ADDED
Binary file
Binary file
Binary file
Binary file
@@ -1,3 +1,3 @@
1
1
  module YandexMystem
2
- VERSION = '0.1.1'
2
+ VERSION = '3.0.0'
3
3
  end
data/lib/yandex_mystem.rb CHANGED
@@ -1,10 +1,12 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'open3'
2
4
  require 'yandex_mystem/version'
3
5
  require 'pathname'
6
+ require 'json'
4
7
 
5
8
  module YandexMystem
6
- class Base
7
- WORD_SCANNER_REGEXP = /^([^\{]+)\{(.+)\}$/.freeze
9
+ class Base
8
10
 
9
11
  def self.stem(text)
10
12
  exec = [command, self::ARGUMENTS].join(' ')
@@ -20,7 +22,7 @@ module YandexMystem
20
22
 
21
23
  def self.command
22
24
  @command ||= begin
23
- path = Pathname.new(__FILE__) + '../../app/'
25
+ path = Pathname.new(__FILE__) + '../../bin/'
24
26
  path + "mystem-#{command_postfix}"
25
27
  end
26
28
  end
@@ -34,9 +36,9 @@ module YandexMystem
34
36
  when /64.+linux$/
35
37
  'linux-64'
36
38
  when /darwin/
37
- 'mac'
39
+ raise 'Mystem 3.0 does not support Max OS X.'
38
40
  when /freebsd/
39
- raise 'Create an issue or add pull request on a github.'
41
+ 'bsd'
40
42
  else
41
43
  raise 'Unknown OS'
42
44
  end
@@ -44,54 +46,19 @@ module YandexMystem
44
46
  end
45
47
 
46
48
  class Simple < Base
47
- ARGUMENTS = '-e utf-8 -n'
48
-
49
- NOT_INCLUDE_REGEXP = /.+\?\?$/.freeze
49
+ ARGUMENTS = '-e utf-8 -n --format json'
50
50
 
51
51
 
52
52
  def self.parse(data)
53
- parsed = data.scan(WORD_SCANNER_REGEXP).map do |(word, words)|
54
- words = words.split('|').select do |w|
55
- !(w =~ NOT_INCLUDE_REGEXP)
56
- end
57
-
58
- [word, words]
59
- end.flatten(1)
60
-
61
- Hash[*parsed]
53
+ Hash[ JSON.parse('[' + data.split("\n").join(",") + ']', :symbolize_names => true).inject([]){|s, h| s + [[ h[:text], h[:analysis].map{|a| a[:lex]} ]]} ]
62
54
  end
63
55
  end
64
56
 
65
- class Extended < Base
66
- ARGUMENTS = '-e utf-8 -nifg'
67
-
68
- REGEXP = /([^\|:]+):([0-9\.]+)=([A-Z]+)/
69
-
70
- Word = Struct.new(:word, :frequency, :part)
57
+ class Raw < Base
58
+ ARGUMENTS = '-e utf-8 -ig -n --weight --format json --eng-gr'
71
59
 
72
60
  def self.parse(data)
73
- parsed = {}
74
-
75
- data.scan(WORD_SCANNER_REGEXP).each do |(word, words)|
76
- unless parsed.key?(word)
77
- words = words.scan(REGEXP).map do |w|
78
- to_word(w)
79
- end
80
-
81
- unless words.size.zero?
82
- parsed[word] = words.sort_by(&:frequency).reverse
83
- end
84
- end
85
- end
86
-
87
- parsed
88
- end
89
-
90
- private
91
-
92
- def self.to_word(w)
93
- word, frequency, part = w
94
- Word.new(word, frequency.to_f, part)
95
- end
61
+ JSON.parse('[' + data.split("\n").join(",") + ']', :symbolize_names => true)
62
+ end
96
63
  end
97
64
  end
@@ -4,7 +4,28 @@ require 'spec_helper'
4
4
  describe YandexMystem do
5
5
  context YandexMystem::Simple do
6
6
  it "should stem words" do
7
- data = YandexMystem::Simple.stem('мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements')
7
+ data = YandexMystem::Simple.stem("мальчики мальчиков девочки девочек компьютеров компьютере сов пошли elements")
8
+ data['мальчики'].should eq %w(мальчик)
9
+ data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
10
+ data['девочки'].should eq %w(девочка)
11
+ data['девочек'].should eq %w(девочка)
12
+ data['сов'].should eq %w(сова)
13
+ data['пошли'].should eq %w(пойти посылать)
14
+ data['elements'].should eq []
15
+ end
16
+
17
+ it "should stem words in few lines" do
18
+ data = YandexMystem::Simple.stem(%[
19
+
20
+
21
+ мальчики
22
+ мальчиков
23
+ девочки девочек компьютеров компьютере сов пошли
24
+ elements
25
+
26
+
27
+
28
+ ])
8
29
  data['мальчики'].should eq %w(мальчик)
9
30
  data['мальчиков'].should eq %w(мальчик мальчиков мальчиковый)
10
31
  data['девочки'].should eq %w(девочка)
@@ -15,41 +36,32 @@ describe YandexMystem do
15
36
  end
16
37
  end
17
38
 
18
- context YandexMystem::Extended do
39
+ context YandexMystem::Raw do
19
40
  it 'latin words' do
20
- words = YandexMystem::Extended.stem('elements')
21
-
22
- words.size.should eq 0
41
+ response = YandexMystem::Raw.stem('Elements')
23
42
 
24
- words['elements'].should be_nil
43
+ response.size.should eq 1
44
+ response[0][:analysis].should eq []
45
+ response[0][:text].should eq 'Elements'
25
46
  end
26
47
 
27
48
  it 'multiple definitions' do
28
- words = YandexMystem::Extended.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
49
+ response = YandexMystem::Raw.stem('девочка мальчиков пошла к комьютерам искать, в прочем, как и всегда')
29
50
 
30
- words.size.should eq 11
51
+ response.size.should eq 11
31
52
 
32
- words['девочка'].first.word.should eq 'девочка'
33
- words['девочка'].first.frequency.should eq 185.2
34
- words['девочка'].first.part.should eq 'S'
53
+ response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:lex].should eq 'девочка'
54
+ response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:wt].should eq 1
55
+ response.find_all{|h| h[:text] == 'девочка'}.first[:analysis].first[:gr].should eq 'S,f,anim=nom,sg'
35
56
 
36
-
37
- words['мальчиков'].size.should eq 3
57
+ response.find_all{|h| h[:text] == 'мальчиков'}.first[:analysis].size.should eq 3
38
58
  end
39
59
 
40
- it 'sort by frequency' do
41
- words = YandexMystem::Extended.stem('сосланный')
42
-
43
- words['сосланный'].size.should eq 2
44
-
45
- words['сосланный'].map(&:word).should eq %w(ссылать сосланный)
46
- words['сосланный'].map(&:frequency).should eq [1.2, 0.5]
47
- words['сосланный'].map(&:part).should eq %w(V S)
60
+ it 'get geo name' do
61
+ response = YandexMystem::Raw.stem('Москва')
62
+ response.find_all{|h| h[:text] == 'Москва'}.first[:analysis].first[:gr].should include 'geo'
48
63
  end
49
64
 
50
- it 'set multiple times the same word, but lowercase and uppercase is different' do
51
- YandexMystem::Extended.should_receive(:to_word).exactly(4).times.and_return(YandexMystem::Extended.const_get(:Word).new('в', 2, 'S'))
52
- YandexMystem::Extended.stem('В в в в')
53
- end
65
+
54
66
  end
55
67
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yandex_mystem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Polushkin
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-28 00:00:00.000000000 Z
11
+ date: 2014-08-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -43,21 +43,24 @@ description: Mystem is a software that provided by the Yandex only for non-comme
43
43
  simple morphological analysis of russian words.
44
44
  email:
45
45
  - dmitry.polushkin@gmail.com
46
- executables: []
46
+ executables:
47
+ - mystem-bsd
48
+ - mystem-linux-32
49
+ - mystem-linux-64
50
+ - mystem-win.exe
47
51
  extensions: []
48
52
  extra_rdoc_files: []
49
53
  files:
50
54
  - ".gitignore"
51
55
  - ".rspec"
52
- - ".rvmrc"
53
56
  - ".travis.yml"
54
57
  - Gemfile
55
58
  - README.md
56
59
  - Rakefile
57
- - app/mystem-linux-32
58
- - app/mystem-linux-64
59
- - app/mystem-mac
60
- - app/mystem-win.exe
60
+ - bin/mystem-bsd
61
+ - bin/mystem-linux-32
62
+ - bin/mystem-linux-64
63
+ - bin/mystem-win.exe
61
64
  - lib/yandex_mystem.rb
62
65
  - lib/yandex_mystem/version.rb
63
66
  - spec/spec_helper.rb
data/.rvmrc DELETED
@@ -1 +0,0 @@
1
- rvm use 2.0.0 --create
data/app/mystem-linux-32 DELETED
Binary file
data/app/mystem-linux-64 DELETED
Binary file
data/app/mystem-mac DELETED
Binary file
data/app/mystem-win.exe DELETED
Binary file