mormor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +29 -0
- data/README.md +77 -0
- data/exe/mormor-dump +7 -0
- data/lib/mormor.rb +11 -0
- data/lib/mormor/dictionary.rb +128 -0
- data/lib/mormor/fsa.rb +96 -0
- data/lib/mormor/fsa/cfsa2.rb +118 -0
- data/lib/mormor/fsa/enumerator.rb +66 -0
- data/lib/mormor/fsa/fsa5.rb +100 -0
- metadata +123 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2293b20144e224eff386a73bc985a03e360dcd9a73afcde01af33ec0b0bc9d32
|
4
|
+
data.tar.gz: 2e0e577ee7925c5cdc453870152b8d30698371b9c31f65dc1aa0bff3c0475e4a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 63f3bb8643b13a296a8fc3dfecab97dd88f948660b9aed395d9264066edd666137c321a35efeeeda2640099d98641784064495039fa9e3a3339102a2ea6f04ed
|
7
|
+
data.tar.gz: 3c42a83dfcfbfe52348559d8171ea7d10a5bb5fa4087d90824b9e82d07057cfb9748e5f55e9a586561726106c6e37a25b8ec46d67cd4bf1872118a5704c54beb
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
BSD 3-Clause License
|
2
|
+
|
3
|
+
Copyright (c) 2019, Victor Shepelev
|
4
|
+
All rights reserved.
|
5
|
+
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
8
|
+
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
10
|
+
list of conditions and the following disclaimer.
|
11
|
+
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
14
|
+
and/or other materials provided with the distribution.
|
15
|
+
|
16
|
+
3. Neither the name of the copyright holder nor the names of its
|
17
|
+
contributors may be used to endorse or promote products derived from
|
18
|
+
this software without specific prior written permission.
|
19
|
+
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# MorMor
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/mormor.svg)](http://badge.fury.io/rb/mormor)
|
4
|
+
|
5
|
+
**MorMor** is a pure Ruby [morfologik](https://github.com/morfologik/morfologik-stemming) dictionary client that can be used for POS (part of speech) tagging and simplistic spellchecking. The _Morfologik_ format's distinguishing feature is that it is the primary dictionary format for [LanguageTool](https://github.com/languagetool-org/languagetool), so a lot of ready-made, high-quality dictionaries exist.
|
6
|
+
|
7
|
+
## Features/Problems
|
8
|
+
|
9
|
+
* **No dependencies¹, pure Ruby**
|
10
|
+
* **Fast**: I don't have any detailed numbers, but naive test on my laptop shows 3 mln lookups/second on a very large dictionary (Polish, several million word forms).
|
11
|
+
* Relatively **memory-efficient**: Typical dictionary file size is 1-3 MB; mormor just loads it into memory as bytes (i.e. each byte => Ruby Integer) and that's all the memory it needs.
|
12
|
+
* **Dictionaries** for a lot of languages already exist: unlike your typical POS tagger, usage instructions do not start with "First, take your corpora and train the tagger as you please" (see "Dictionaries" section).
|
13
|
+
* At the moment, it is just a **naive** port of the original Morfologik Java code, but it works with all the dictionaries I could find:
|
14
|
+
* Of possible dictionary formats, only FSA5 and CFSA2 are implemented (not CFSA);
|
15
|
+
* Of possible dictionary "encoders", only "SUFFIX" and "PREFIX" are implemented;
|
16
|
+
* No tests/specs, but it works (and checked thoroughly with existing dictionaries); TBH, original Morfologik doesn't have much, either;
|
17
|
+
* Morfologik's spellchecker suggestions/candidates are **not** ported, so mormor can be used only for "sanity" spellchecking ("this word is/is not in the dictionary")
|
18
|
+
|
19
|
+
<small>¹The only runtime dependency is [backports](https://github.com/marcandre/backports) and that's only because I am too fond of modern Ruby features to sacrifice them to "no-dependencies" god.</small>
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
0. Install `mormor` gem (via bundler or just `[sudo] gem install mormor`)
|
24
|
+
1. Take a dictionary for your language (see "Dictionaries" section below)
|
25
|
+
2. Now...
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
require 'mormor'
|
29
|
+
|
30
|
+
dictionary = MorMor::Dictionary.new('path/to/english')
|
31
|
+
dictionary.lookup('meowing')
|
32
|
+
# => [#<struct MorMor::Dictionary::Word stem="meow", tags="VBG">]
|
33
|
+
dictionary.lookup('barks')
|
34
|
+
# => [#<struct MorMor::Dictionary::Word stem="bark", tags="NNS">,
|
35
|
+
# #<struct MorMor::Dictionary::Word stem="bark", tags="VBZ">]
|
36
|
+
dictionary.lookup('borogoves')
|
37
|
+
# => nil
|
38
|
+
|
39
|
+
dictionary = MorMor::Dictionary.new('path/to/ukrainian')
|
40
|
+
dictionary.lookup("солов'їна")
|
41
|
+
# => [#<struct MorMor::Dictionary::Word stem="солов'їний", tags="adj:f:v_kly">,
|
42
|
+
# #<struct MorMor::Dictionary::Word stem="солов'їний", tags="adj:f:v_naz">]
|
43
|
+
```
|
44
|
+
|
45
|
+
`Dictionary#lookup` returns an array of structs which describe all possible base forms + part of speech / word form tags, or `nil` if the word is not in the dictionary. (For example, "barks" could be a third person form of the verb "to bark", or plural form of the noun "bark".)
|
46
|
+
|
47
|
+
Tags are dependent on the particular dictionary used and typically documented in a free form alongside the dictionaries.
|
48
|
+
|
49
|
+
## Dictionaries
|
50
|
+
|
51
|
+
A lot of dictionaries in Morfologik format could be found at [LanguageTool's repo](https://github.com/languagetool-org/languagetool). For example, for Polish language, [dictionary is at](https://github.com/languagetool-org/languagetool/tree/master/languagetool-language-modules/pl/src/main/resources/org/languagetool/resource/pl) `languagetool-language-modules/pl/src/main/resources/org/languagetool/resource/pl/`.
|
52
|
+
|
53
|
+
What you need there, are:
|
54
|
+
* `polish.dict` is a dictionary (binary finite-state-automata) itself
|
55
|
+
* `polish.info` is dictionary metadata
|
56
|
+
|
57
|
+
In order to use the Polish dictionary with mormor, you need to place both files in the same folder, and then:
|
58
|
+
```ruby
|
59
|
+
pl = MorMor::Dictionary.new('path/to/that/folder/polish') # without extension
|
60
|
+
pl.lookup('świetnie')
|
61
|
+
```
|
62
|
+
|
63
|
+
You may also be interested in `tagset.txt` file of the same folder, which has an explanation for all POS/forms tags in natural language (Polish language, for that case).
|
64
|
+
|
65
|
+
Sometimes (for example, in case of German and Ukrainian), LanguageTool repo contains not the dictionary itself, but a link to other repo/site where it can be downloaded.
|
66
|
+
|
67
|
+
Please **carefully consider** dictionary licenses when using them!
|
68
|
+
|
69
|
+
> **Note:** mormor repo contains copies of dictionary files from LanguageTool and referred projects, but they are **not** a part of the gem distribution and only used for testing the parser/lookup correctness, and demonstration purposes.
|
70
|
+
|
71
|
+
## License and credits
|
72
|
+
|
73
|
+
Most of the credit for the algorithms and original code belongs to the original [Morfologik](https://github.com/morfologik/morfologik-stemming) authors, and to the authors of the papers they based their work on.
|
74
|
+
|
75
|
+
Ruby version is done by [Victor Shepelev](https://zverok.github.io).
|
76
|
+
|
77
|
+
The license is BSD, the same as the original Morfologik.
|
data/exe/mormor-dump
ADDED
data/lib/mormor.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'backports/2.6.0/kernel/then'
|
4
|
+
require 'backports/2.5.0/integer' # allbits? / anybits? / nobits?
|
5
|
+
|
6
|
+
# Morfologik dictionary client
|
7
|
+
# See {Dictionary}.
|
8
|
+
module MorMor
|
9
|
+
end
|
10
|
+
|
11
|
+
require_relative 'mormor/dictionary'
|
@@ -0,0 +1,128 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'fsa'
|
4
|
+
|
5
|
+
module MorMor
  # Morfologik dictionary client.
  #
  # @example
  #   dictionary = MorMor::Dictionary.new('path/to/english')
  #   dictionary.lookup('meowing')
  #   # => [#<struct MorMor::Dictionary::Word stem="meow", tags="VBG">]
  #
  class Dictionary
    # This class is a simplified port of all `Dictionary*.java` classes (Dictionary, DictionaryMetadata,
    # DictionaryLookup etc) of the `morfologik-stemming` package.
    # See the original package to understand the details:
    # https://github.com/morfologik/morfologik-stemming/tree/master/morfologik-stemming/src/main/java/morfologik/stemming

    # Result of {Dictionary#lookup}
    #
    # `stem` is the base form of the looked up word, `tags` is the dictionary-dependent part of
    # speech / word form tags.
    Word = Struct.new(:stem, :tags)

    # @private
    DECODERS = {'SUFFIX' => :suffix, 'PREFIX' => :prefix_suffix}.freeze

    # @private
    # Decoded truncation length that stands for "drop the whole source word".
    REMOVE_EVERYTHING = 255

    # @private
    attr_reader :fsa
    # @return [Hash]
    attr_reader :info

    # @param path [String] Path to dictionary files. It is expected that `path + ".info"` and
    #   `path + ".dict"` files exist and contain a Morfologik dictionary
    # @raise [ArgumentError] if the `.info` file does not exist or names an unsupported encoder
    def initialize(path)
      @path = path # Just for inspect

      read_info(path + '.info')

      @fsa = FSA.read(path + '.dict')
    end

    # @return [String]
    def inspect
      '#<%s %s>' % [self.class, @path]
    end

    # Finds all forms and POS tags of a word in the dictionary.
    #
    # @param word [String] a word to lookup
    # @return [Array<Word>, nil] `nil` when the word is not in the dictionary (including the case
    #   when the word cannot be represented in the dictionary's encoding at all)
    def lookup(word) # rubocop:disable Metrics/AbcSize
      # Method is left unsplit to leave original algorithm (DictionaryLookup.java#lookup) recognizable,
      # hence rubocop:disable

      bword = to_dictionary_bytes(word)
      return unless bword

      # TODO: there could be "input conversion pairs"

      # Note: not bword.bytes, because morfologik expects signed bytes, while String#bytes
      # is analog of unpack('C*'), returning unsigned
      m = fsa.match(bword.unpack('c*'))

      # OC: this case is somewhat confusing: we should have hit the separator
      # first... I don't really know how to deal with it at the time
      # being.
      return unless m.kind == :sequence_is_a_prefix

      # OC: The entire sequence exists in the dictionary. A separator should
      # be the next symbol.
      arc = fsa.find_arc(m.node, @sepbyte)

      # OC: The situation when the arc points to a final node should NEVER
      # happen. After all, we want the word to have SOME base form.
      return if arc.zero? || fsa.final_arc?(arc)

      # OC: There is such a word in the dictionary. Return its base forms.
      fsa.each_sequence(from: fsa.end_node(arc)).map do |encoded|
        # TODO: there could be "output conversion pairs"

        decoded = @decoder.call(bword, encoded).force_encoding(@encoding).encode('UTF-8')

        Word.new(*decoded.split(@separator, 2))
      end
    end

    private

    # Converts the word into a binary string in the dictionary's encoding.
    # Returns nil if the word has characters the dictionary encoding cannot express: such a word
    # cannot be in the dictionary, so it is "not found" rather than an error.
    def to_dictionary_bytes(word)
      word.encode(@encoding).force_encoding('ASCII-8BIT')
    rescue Encoding::UndefinedConversionError
      nil
    end

    # Loads the `.info` metadata file and caches the values needed on every lookup.
    def read_info(path)
      @info = read_values(path)

      # NB: All possible values described in DictionaryAttribute.java

      # Cache it to be quickly accessible
      @encoding = @info.fetch('fsa.dict.encoding')
      @separator = @info.fetch('fsa.dict.separator')
      @sepbyte = @separator.bytes.first

      @decoder = choose_decoder(@info.fetch('fsa.dict.encoder'))
    end

    # Parses a `key=value` file into a Hash; everything after `#` is dropped as a comment and
    # blank lines are skipped.
    def read_values(path)
      File.exist?(path) or fail ArgumentError, "#{path} does not exist"
      File.read(path).split("\n")
          .map { |ln| ln.sub(/\#.*$/, '').strip }.reject(&:empty?)
          .map { |ln| ln.split('=', 2) }
          .to_h
    end

    # Returns the bound decoder Method for the encoder named in the metadata.
    def choose_decoder(name)
      method(DECODERS.fetch(name.upcase) { fail ArgumentError, "Encoder #{name} is not supported yet" })
    end

    # "SUFFIX" decoder: the first byte of `encoded` says how many trailing bytes of `source` to drop
    # ('A' => 0, 'B' => 1, ...), the rest of `encoded` is appended.
    def suffix(source, encoded)
      truncate_suf = (encoded[0...1].bytes.first - 65) & 0xff # 65 is 'A'
      truncate_suf = source.size if truncate_suf == REMOVE_EVERYTHING

      source[0...source.size - truncate_suf] + encoded[1..-1]
    end

    # "PREFIX" decoder: the first two bytes of `encoded` say how many leading and trailing bytes of
    # `source` to drop, the rest of `encoded` is appended.
    def prefix_suffix(source, encoded)
      truncate_pref, truncate_suf = encoded[0...2].bytes.first(2).map { |b| (b - 65) & 0xff } # 65 is 'A'

      # Either marker being 255 means nothing of the source word is kept.
      if truncate_pref == REMOVE_EVERYTHING || truncate_suf == REMOVE_EVERYTHING
        truncate_pref = source.size
        truncate_suf = 0
      end

      source[truncate_pref...source.size - truncate_suf] + encoded[2..-1]
    end
  end
end
|
data/lib/mormor/fsa.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'fsa/enumerator'
|
4
|
+
require_relative 'fsa/fsa5'
|
5
|
+
require_relative 'fsa/cfsa2'
|
6
|
+
|
7
|
+
module MorMor
  # @private
  #
  # This class and its subclasses contain a loose simplified port of the whole `morfologik-fsa`
  # package.
  # Original source at: https://github.com/morfologik/morfologik-stemming/tree/master/morfologik-fsa/src/main/java/morfologik/fsa
  #
  # NB: TBH, I don't always deeply understand what I am doing here. I just ported the Java algorithms
  # statement-by-statement, then rubyfied them a bit and debugged in parallel with the original
  # package to make sure it produces the same result.
  #
  # Code contains some of my comments, original implementations referred where appropriate.
  # Also, in more straightforwardly ported code, original comments are left and marked with "OC:".
  #
  # Subclasses are expected to provide `root_node`, `first_arc`, `end_node`, `arc_label`,
  # `last_arc?`, `final_arc?`, `terminal_arc?` and `skip_arc`; everything here is built on those.
  class FSA
    # LanguageTool seems to use CFSA2 and FSA5, so CFSA is not implemented.
    VERSIONS = {
      5 => 'FSA5',
      0xC5 => 'CFSA',
      0xc6 => 'CFSA2'
    }.freeze

    # Result of {#match}: `kind` is one of :no, :exact, :automaton_has_prefix,
    # :sequence_is_a_prefix; `position` and `node` say where the traversal stopped.
    Match = Struct.new(:kind, :position, :node)

    class << self
      # Reads an automaton from a `.dict` file, choosing the implementation by the version byte
      # that follows the `\fsa` magic header.
      #
      # @param path [String]
      # @return [FSA] instance of a concrete subclass
      # @raise [ArgumentError] on a wrong header or an unknown/unimplemented format version
      def read(path)
        # Block form: the implementations slurp everything they need in their constructors, so
        # the file can (and should) be closed as soon as the object is built.
        File.open(path, 'rb') do |io|
          io.read(4) == '\\fsa' or fail ArgumentError, 'Invalid file header, probably not an FSA.'
          choose_impl(io.getbyte).new(io)
        end
      end

      private

      # Maps the version byte to the implementing class.
      def choose_impl(version_byte)
        name = VERSIONS.fetch(version_byte) {
          fail ArgumentError, 'Unsupported version byte, probably not FSA'
        }
        # The format is known, but may have no implementation (e.g. CFSA).
        constants.include?(name.to_sym) or fail ArgumentError, "Unsupported version: #{name}"

        const_get(name)
      end
    end

    # Iterates through all byte sequences reachable from the node `from`.
    #
    # @return [FSA::Enumerator] when called without a block
    def each_sequence(from: root_node, &block)
      enumerator = Enumerator.new(self, from)
      block ? enumerator.each(&block) : enumerator
    end

    # Next sibling arc of the same node, or 0 when `arc` is the last one.
    def next_arc(arc)
      last_arc?(arc) ? 0 : skip_arc(arc)
    end

    # Yields every arc leaving the node `from`.
    def each_arc(from:)
      return to_enum(__method__, from: from) unless block_given?

      arc = first_arc(from)
      until arc.zero?
        yield arc
        arc = next_arc(arc)
      end
    end

    # Arc leaving `node` that carries `label`, or 0 if there is none.
    def find_arc(node, label)
      each_arc(from: node).detect { |a| arc_label(a) == label } || 0
    end

    # Port of FSATraversal.java
    # Method is left unsplit to leave original algorithm recognizable, hence rubocop:disable's
    #
    # @param sequence [Array<Integer>] signed bytes to walk through the automaton
    # @param node [Integer] node to start from
    # @return [Match]
    def match(sequence, node = root_node) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity
      return Match.new(:no) if node.zero?

      sequence.each_with_index do |byte, i|
        a = find_arc(node, byte)

        case
        when a.zero?
          return i.zero? ? Match.new(:no, i, node) : Match.new(:automaton_has_prefix, i, node)
        when i + 1 == sequence.size && final_arc?(a)
          return Match.new(:exact, i, node)
        when terminal_arc?(a)
          return Match.new(:automaton_has_prefix, i + 1, node)
        else
          node = end_node(a)
        end
      end

      Match.new(:sequence_is_a_prefix, 0, node)
    end
  end
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MorMor
  class FSA
    # Port of CFSA2.java
    #
    # Constants and the binary layout are described in the original:
    # https://github.com/morfologik/morfologik-stemming/blob/master/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java
    #
    # As read by the code below: every arc starts with a flag byte whose low LABEL_INDEX_BITS bits
    # index the label mapping table (0 means the label is stored explicitly in the next byte);
    # unless BIT_TARGET_NEXT is set, a variable-length integer with the target address follows.
    class CFSA2 < FSA
      NUMBERS = 1 << 8
      BIT_TARGET_NEXT = 1 << 7
      LABEL_INDEX_BITS = 5
      LABEL_INDEX_MASK = (1 << LABEL_INDEX_BITS) - 1
      BIT_LAST_ARC = 1 << 6
      BIT_FINAL_ARC = 1 << 5

      # @param io [IO] stream positioned right after the magic header and version byte
      def initialize(io)
        # Java's short = "network (big-endian)"
        header = io.read(2).unpack('n') # rubocop:disable Style/UnpackFirst -- doesn't work under 2.3
        @numbers = header.first.allbits?(NUMBERS)

        table_size = io.getbyte & 0xff
        @mapping = io.read(table_size).unpack('c*')

        # Everything else is arcs data; kept as signed bytes, like in Java.
        @arcs = io.read.unpack('c*')
      end

      def root_node
        destination_node_offset(first_arc(0))
      end

      # Navigating through arcs

      # With NUMBERS, each node starts with a variable-length integer that has to be skipped.
      def first_arc(node)
        return node unless numbers?

        skip_v_int(node)
      end

      def end_node(arc)
        destination_node_offset(arc)
      end

      # Examining arcs

      def arc_label(arc)
        mapped = arcs[arc] & LABEL_INDEX_MASK
        return arcs[arc + 1] unless mapped.positive?

        mapping[mapped]
      end

      def terminal_arc?(arc)
        destination_node_offset(arc).zero?
      end

      def last_arc?(arc)
        flag_set?(arc, BIT_LAST_ARC)
      end

      def final_arc?(arc)
        flag_set?(arc, BIT_FINAL_ARC)
      end

      private

      attr_reader :arcs, :mapping

      def numbers?
        @numbers
      end

      # Whether the arc's flag byte has all bits of `bit` set.
      def flag_set?(arc, bit)
        arcs[arc].allbits?(bit)
      end

      # Offset of the first byte after the variable-length integer starting at `offset`
      # (a negative signed byte means "more bytes follow").
      def skip_v_int(offset)
        cursor = offset
        cursor += 1 while arcs[cursor].negative?
        cursor + 1
      end

      # Decodes the variable-length integer at `offset`: 7 payload bits per byte, least
      # significant group first, continuing while the byte is negative.
      def read_v_int(array, offset)
        cursor = offset
        result = 0
        shift = 0

        loop do
          byte = array[cursor]
          result |= (byte & 0x7F) << shift
          return result unless byte.negative?

          cursor += 1
          shift += 7
        end
      end

      def destination_node_offset(arc)
        unless next_set?(arc)
          # OC: The destination node address is v-coded. v-code starts either
          # at the next byte (label indexed) or after the next byte (label explicit).
          label_width = arcs[arc].nobits?(LABEL_INDEX_MASK) ? 2 : 1
          return read_v_int(arcs, arc + label_width)
        end

        # OC: Follow until the last arc of this state.
        arc = next_arc(arc) until last_arc?(arc)

        # OC: And return the byte right after it.
        skip_arc(arc)
      end

      def next_set?(arc)
        flag_set?(arc, BIT_TARGET_NEXT)
      end

      # Offset of the first byte after the arc starting at `offset`.
      def skip_arc(offset)
        flag = arcs[offset]

        # OC: Explicit label?
        following = offset + (flag.anybits?(LABEL_INDEX_MASK) ? 1 : 2)

        # OC: Explicit goto?
        flag.anybits?(BIT_TARGET_NEXT) ? following : skip_v_int(following)
      end
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MorMor
  class FSA
    # Rubyfied port of ByteSequenceIterator.java
    #
    # See: https://github.com/morfologik/morfologik-stemming/blob/master/morfologik-fsa/src/main/java/morfologik/fsa/ByteSequenceIterator.java
    #
    # From some node of the automaton, it iterates through all paths starting at that node to their
    # end, and yields each full path packed into the original dictionary bytes string.
    #
    # The iteration consumes internal state: once all sequences have been yielded, further calls
    # to {#each} yield nothing.
    class Enumerator
      include Enumerable

      # @param fsa [FSA] automaton to traverse
      # @param node [Integer] node to start from
      def initialize(fsa, node)
        @fsa = fsa
        # Depth-first traversal state: for each level, the arc to visit next (0 = level exhausted).
        @arcs_stack = []
        # Labels of the path currently being walked, one per level.
        @sequence = []

        first = fsa.first_arc(node)
        arcs_stack << first unless first.zero?
      end

      # @yield [String] binary string with the bytes of the next sequence
      # @return [::Enumerator] when called without a block
      def each
        return to_enum(__method__) unless block_given?

        while (el = advance)
          yield el.pack('C*')
        end
      end

      private

      attr_reader :fsa, :arcs_stack, :sequence

      # Walks to the next final arc and returns the labels of the path leading to it, or nil when
      # the traversal is finished. The returned array is the internal buffer: use it immediately.
      #
      # Method is left unsplit to leave original algorithm recognizable, hence rubocop:disable
      def advance # rubocop:disable Metrics/AbcSize
        until arcs_stack.empty?
          arc = arcs_stack.last

          if arc.zero?
            # OC: Remove the current node from the queue.
            arcs_stack.pop
            next
          end

          # Length of the path that ends with this arc; remembered before descending.
          depth = arcs_stack.size

          # OC: Go to the next arc, but leave it on the stack
          # so that we keep the recursion depth level accurate.
          arcs_stack[-1] = fsa.next_arc(arc)

          sequence[depth - 1] = fsa.arc_label(arc)

          # OC: Recursively descend into the arc's node.
          # The stack holds arcs, so the node is converted the same way as in the constructor.
          arcs_stack.push(fsa.first_arc(fsa.end_node(arc))) unless fsa.terminal_arc?(arc)

          next unless fsa.final_arc?(arc)

          # Drop everything left over from a previous, longer path.
          sequence.slice!(depth..-1)
          return sequence
        end

        nil
      end
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MorMor
  class FSA
    # Port of FSA5.java
    #
    # Constants and the binary layout are described in the original:
    # https://github.com/morfologik/morfologik-stemming/blob/master/morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java
    #
    # As read by the code below: an arc is a label byte followed by a flags byte; when
    # BIT_TARGET_NEXT is not set, the flags byte is the first of `gtl` bytes that together hold
    # the target address shifted left by 3 bits.
    class FSA5 < FSA
      BIT_FINAL_ARC = 1 << 0
      BIT_LAST_ARC = 1 << 1
      BIT_TARGET_NEXT = 1 << 2
      ADDRESS_OFFSET = 1

      # @param io [IO] stream positioned right after the magic header and version byte
      def initialize(io)
        @filler, @annotation, hgtl = Array.new(3) { io.getbyte }

        # OC: Determine if the automaton was compiled with NUMBERS. If so, modify
        # ctl and goto fields accordingly.

        # zverok: the original computes a set of flags here, but nothing ever uses it:
        #   flags = [FLEXIBLE, STOPBIT, NEXTBIT]
        #   flags << NUMBERS if hgtl.anybits?(0xf0)

        # High nibble: size of per-node data; low nibble: size of the flags/address field.
        @node_data_length = (hgtl >> 4) & 0x0f
        @gtl = hgtl & 0x0f

        # Everything else is arcs data; kept as signed bytes, like in Java.
        @arcs = io.read.unpack('c*')
      end

      def root_node
        # OC: Skip dummy node marking terminating state.
        epsilon = skip_arc(first_arc(0))

        # OC: And follow the epsilon node's first (and only) arc.
        destination_node_offset(first_arc(epsilon))
      end

      # Navigating through arcs

      def first_arc(node)
        node + @node_data_length
      end

      def end_node(arc)
        destination_node_offset(arc)
      end

      # Examining arcs

      def arc_label(arc)
        arcs[arc]
      end

      def final_arc?(arc)
        flags(arc).allbits?(BIT_FINAL_ARC)
      end

      def last_arc?(arc)
        flags(arc).allbits?(BIT_LAST_ARC)
      end

      def terminal_arc?(arc)
        destination_node_offset(arc).zero?
      end

      private

      attr_reader :arcs, :gtl

      # The byte that follows the arc's label and carries its flag bits.
      def flags(arc)
        arcs[arc + ADDRESS_OFFSET]
      end

      # Reads `n` bytes starting at `start` as one unsigned integer, least significant byte first.
      def decode_from_bytes(arcs, start, n)
        (0...n).sum { |i| (arcs[start + i] & 0xff) << (8 * i) }
      end

      def destination_node_offset(arc)
        # OC: The destination node follows this arc in the array.
        return skip_arc(arc) if next_set?(arc)

        # OC: The destination node address has to be extracted from the arc's
        # goto field.
        decode_from_bytes(arcs, arc + ADDRESS_OFFSET, gtl) >> 3
      end

      def next_set?(arc)
        flags(arc).allbits?(BIT_TARGET_NEXT)
      end

      # OC: Read the arc's layout and skip as many bytes, as needed.
      def skip_arc(offset)
        # OC: label + flags, or label + flags/address
        width = next_set?(offset) ? 1 + 1 : 1 + gtl
        offset + width
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mormor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Victor Shepelev
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-06-21 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: backports
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 3.15.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 3.15.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rubygems-tasks
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: yard
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: forspell
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description:
|
84
|
+
email: zverok.offline@gmail.com
|
85
|
+
executables:
|
86
|
+
- mormor-dump
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- LICENSE.txt
|
91
|
+
- README.md
|
92
|
+
- exe/mormor-dump
|
93
|
+
- lib/mormor.rb
|
94
|
+
- lib/mormor/dictionary.rb
|
95
|
+
- lib/mormor/fsa.rb
|
96
|
+
- lib/mormor/fsa/cfsa2.rb
|
97
|
+
- lib/mormor/fsa/enumerator.rb
|
98
|
+
- lib/mormor/fsa/fsa5.rb
|
99
|
+
homepage: https://github.com/molybdenum-99/mormor
|
100
|
+
licenses:
|
101
|
+
- BSD-3-Clause
|
102
|
+
metadata: {}
|
103
|
+
post_install_message:
|
104
|
+
rdoc_options: []
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: 2.3.0
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
requirements: []
|
118
|
+
rubyforge_project:
|
119
|
+
rubygems_version: 2.7.7
|
120
|
+
signing_key:
|
121
|
+
specification_version: 4
|
122
|
+
summary: 'Morfologik dictionaries client in pure Ruby: POS tagging & spellcheck'
|
123
|
+
test_files: []
|