vn_tagger 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -5
- data/lib/vn_tagger/document.rb +29 -0
- data/lib/vn_tagger/tagger.rb +20 -14
- data/lib/vn_tagger/version.rb +1 -1
- data/lib/vn_tagger/word.rb +35 -0
- data/spec/vn_tagger/document_spec.rb +20 -0
- data/spec/vn_tagger/tagger_spec.rb +6 -5
- data/spec/vn_tagger/word_spec.rb +18 -0
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 637f23fe16d647e9041131360de3ffd39319c504
|
4
|
+
data.tar.gz: ca68ec84d04a2b0689d4f3dff85a9dd8a0a8f874
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2430d567ed7dc55a0cea23ffbd0781dfd147afb7576a3593bc71cbef49779c74b737c5d73f88e67101294a42ad5f629f34f9c8a0974e1d34ce7399faded2a21d
|
7
|
+
data.tar.gz: 2caaec9e11213317445b23651cfc64272d35208f6bd256a62a053b01f24750536564b043a9e695672fcf97c2825944e49f3ec83a5b7ddd385e9c5a3c65e30658
|
data/README.md
CHANGED
@@ -20,11 +20,10 @@ Or install it yourself as:
|
|
20
20
|
text = 'Mourinho là huấn luyện viên của Chelsea'
|
21
21
|
document = VnTagger::Tagger.tag(text)
|
22
22
|
|
23
|
-
document.
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
keys.first.child.text #=> 'Mourinho'
|
23
|
+
document.words.map { |word|
|
24
|
+
[word.text, word.position]
|
25
|
+
} # => [["Mourinho", "Np"], ["là", "V"], ["huấn luyện viên", "N"], ["của", "E"],
|
26
|
+
["Chelsea", "Np"]]
|
28
27
|
```
|
29
28
|
|
30
29
|
## Contributing
|
data/lib/vn_tagger/document.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'word'
|
2
|
+
|
3
|
+
module VnTagger
|
4
|
+
class Document
|
5
|
+
attr_reader :xml_document
|
6
|
+
|
7
|
+
def initialize(xml_document)
|
8
|
+
@xml_document = xml_document
|
9
|
+
end
|
10
|
+
|
11
|
+
def words
|
12
|
+
@words ||= uncached_words
|
13
|
+
end
|
14
|
+
|
15
|
+
def uncached_words
|
16
|
+
xml_document.xpath('//w').map do |element|
|
17
|
+
Word.new(element.attr('pos'), element.child.text)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def filter_by_tag(tag = 'Proper noun')
|
22
|
+
words.select { |word| word.is_tag?(tag) }
|
23
|
+
end
|
24
|
+
|
25
|
+
def proper_noun_words
|
26
|
+
filter_by_tag('Proper noun').map(&:text)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/vn_tagger/tagger.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
require 'document'
|
2
3
|
|
3
4
|
module VnTagger
|
4
5
|
class Tagger
|
@@ -13,16 +14,28 @@ module VnTagger
|
|
13
14
|
|
14
15
|
def tag
|
15
16
|
write_to_file
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
17
|
+
@success = system("cd #{ROOT_PATH}; #{COMMAND} -i #{INPUT} -o #{OUTPUT}")
|
18
|
+
end
|
19
|
+
|
20
|
+
def xml_result
|
21
|
+
@xml_result ||= if @success
|
22
|
+
file = File.open(OUTPUT)
|
23
|
+
xml_document = Nokogiri::XML(file)
|
24
|
+
file.close
|
25
|
+
xml_document
|
26
|
+
else
|
27
|
+
Nokogiri::XML('')
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def result
|
32
|
+
@result ||= Document.new(xml_result)
|
22
33
|
end
|
23
34
|
|
24
35
|
def self.tag(text)
|
25
|
-
new(text)
|
36
|
+
tagger = new(text)
|
37
|
+
tagger.tag
|
38
|
+
tagger.result
|
26
39
|
end
|
27
40
|
|
28
41
|
private
|
@@ -33,13 +46,6 @@ module VnTagger
|
|
33
46
|
file.close
|
34
47
|
end
|
35
48
|
|
36
|
-
def result_from_output
|
37
|
-
file = File.open(OUTPUT)
|
38
|
-
doc = Nokogiri::XML(file)
|
39
|
-
file.close
|
40
|
-
doc
|
41
|
-
end
|
42
|
-
|
43
49
|
def normalize(string)
|
44
50
|
string.to_s.gsub(/(\"|\')/, '')
|
45
51
|
end
|
data/lib/vn_tagger/version.rb
CHANGED
data/lib/vn_tagger/word.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
module VnTagger
|
2
|
+
class Word
|
3
|
+
attr_reader :position, :text
|
4
|
+
|
5
|
+
WORD_POSITIONS = {
|
6
|
+
'Np' => 'Proper noun',
|
7
|
+
'Nc' => 'Classifier',
|
8
|
+
'Nu' => 'Unit noun',
|
9
|
+
'N' => 'Common noun',
|
10
|
+
'V' => 'Verb',
|
11
|
+
'A' => 'Adjective',
|
12
|
+
'P' => 'Pronoun',
|
13
|
+
'R' => 'Adverb',
|
14
|
+
'L' => 'Determiner',
|
15
|
+
'M' => 'Numeral',
|
16
|
+
'E' => 'Preposition',
|
17
|
+
'C' => 'Subordinating conjunction',
|
18
|
+
'CC' => 'Coordinating conjunction',
|
19
|
+
'I' => 'Interjection',
|
20
|
+
'T' => 'Auxiliary, modal words',
|
21
|
+
'Y' => 'Abbreviation',
|
22
|
+
'Z' => 'Bound morphemes',
|
23
|
+
'X' => 'Unknown'
|
24
|
+
}
|
25
|
+
|
26
|
+
def initialize(position, text)
|
27
|
+
@position = position
|
28
|
+
@text = text
|
29
|
+
end
|
30
|
+
|
31
|
+
def is_tag?(tag = 'Proper noun')
|
32
|
+
WORD_POSITIONS[position] == tag
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/spec/vn_tagger/document_spec.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe VnTagger::Document do
|
4
|
+
let(:xml_text) { '<doc><s><w pos="Np">HLV</w></s></doc>' }
|
5
|
+
let(:xml_document) { Nokogiri::XML(xml_text) }
|
6
|
+
let(:document) { described_class.new(xml_document) }
|
7
|
+
|
8
|
+
describe '#words' do
|
9
|
+
it 'returns data extract from xml document' do
|
10
|
+
word = document.words.first
|
11
|
+
expect(word.text).to eq 'HLV'
|
12
|
+
expect(word.position).to eq 'Np'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe '#proper_noun_words' do
|
17
|
+
subject { document.proper_noun_words }
|
18
|
+
it { is_expected.to eq ['HLV'] }
|
19
|
+
end
|
20
|
+
end
|
data/spec/vn_tagger/tagger_spec.rb
CHANGED
@@ -4,13 +4,14 @@ describe VnTagger::Tagger do
|
|
4
4
|
describe '#tag' do
|
5
5
|
let(:text) { 'HLV cùa Chelsea không hối tiếc vì hành động bỏ về sớm trong trận gặp Aston Villa.' }
|
6
6
|
let(:tagger) { described_class.new(text) }
|
7
|
-
let(:result) { tagger.
|
8
|
-
let(:
|
7
|
+
let(:result) { tagger.result }
|
8
|
+
let(:word) { result.words.first }
|
9
9
|
|
10
10
|
it 'returns xml tagged text' do
|
11
|
-
|
12
|
-
expect(
|
13
|
-
expect(
|
11
|
+
tagger.tag
|
12
|
+
expect(result).to be_a(VnTagger::Document)
|
13
|
+
expect(word.position).to eq 'Np'
|
14
|
+
expect(word.text).to eq 'HLV'
|
14
15
|
end
|
15
16
|
end
|
16
17
|
end
|
data/spec/vn_tagger/word_spec.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe VnTagger::Word do
|
4
|
+
describe '#is_tag?' do
|
5
|
+
let(:word) { described_class.new('N', 'text') }
|
6
|
+
subject { word.is_tag?(tag) }
|
7
|
+
|
8
|
+
context 'when tag is the same with word position' do
|
9
|
+
let(:tag) { 'Common noun' }
|
10
|
+
it { is_expected.to eq true }
|
11
|
+
end
|
12
|
+
|
13
|
+
context 'when tag is not the same with word position' do
|
14
|
+
let(:tag) { 'Verb' }
|
15
|
+
it { is_expected.to eq false }
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vn_tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hieu Nguyen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-12-
|
11
|
+
date: 2014-12-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -86,8 +86,10 @@ files:
|
|
86
86
|
- lib/vn.hus.nlp.tokenizer-4.1.1.jar
|
87
87
|
- lib/vn.hus.nlp.utils-1.0.0.jar
|
88
88
|
- lib/vn_tagger.rb
|
89
|
+
- lib/vn_tagger/document.rb
|
89
90
|
- lib/vn_tagger/tagger.rb
|
90
91
|
- lib/vn_tagger/version.rb
|
92
|
+
- lib/vn_tagger/word.rb
|
91
93
|
- resources/automata/dfaLexicon.xml
|
92
94
|
- resources/automata/externalLexicon.xml
|
93
95
|
- resources/bigram/bigram.xml
|
@@ -98,7 +100,9 @@ files:
|
|
98
100
|
- resources/normalization/rules.txt
|
99
101
|
- resources/prefix/namedEntityPrefix.xml
|
100
102
|
- spec/spec_helper.rb
|
103
|
+
- spec/vn_tagger/document_spec.rb
|
101
104
|
- spec/vn_tagger/tagger_spec.rb
|
105
|
+
- spec/vn_tagger/word_spec.rb
|
102
106
|
- spec/vn_tagger_spec.rb
|
103
107
|
- vn.hus.nlp.tagger-4.2.0.jar
|
104
108
|
- vnTagger.sh
|
@@ -130,5 +134,7 @@ specification_version: 4
|
|
130
134
|
summary: This is a wrapper for vn_tagger library, a A POS tagger for Vietnamese texts.
|
131
135
|
test_files:
|
132
136
|
- spec/spec_helper.rb
|
137
|
+
- spec/vn_tagger/document_spec.rb
|
133
138
|
- spec/vn_tagger/tagger_spec.rb
|
139
|
+
- spec/vn_tagger/word_spec.rb
|
134
140
|
- spec/vn_tagger_spec.rb
|