vn_tagger 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -5
- data/lib/vn_tagger/document.rb +29 -0
- data/lib/vn_tagger/tagger.rb +20 -14
- data/lib/vn_tagger/version.rb +1 -1
- data/lib/vn_tagger/word.rb +35 -0
- data/spec/vn_tagger/document_spec.rb +20 -0
- data/spec/vn_tagger/tagger_spec.rb +6 -5
- data/spec/vn_tagger/word_spec.rb +18 -0
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 637f23fe16d647e9041131360de3ffd39319c504
|
4
|
+
data.tar.gz: ca68ec84d04a2b0689d4f3dff85a9dd8a0a8f874
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2430d567ed7dc55a0cea23ffbd0781dfd147afb7576a3593bc71cbef49779c74b737c5d73f88e67101294a42ad5f629f34f9c8a0974e1d34ce7399faded2a21d
|
7
|
+
data.tar.gz: 2caaec9e11213317445b23651cfc64272d35208f6bd256a62a053b01f24750536564b043a9e695672fcf97c2825944e49f3ec83a5b7ddd385e9c5a3c65e30658
|
data/README.md
CHANGED
@@ -20,11 +20,10 @@ Or install it yourself as:
|
|
20
20
|
text = 'Mourinho là huấn luyện viên của Chelsea'
|
21
21
|
document = VnTagger::Tagger.tag(text)
|
22
22
|
|
23
|
-
document.
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
keys.first.child.text #=> 'Mourinho'
|
23
|
+
document.words.map { |word|
|
24
|
+
[word.text, word.position]
|
25
|
+
} # => [["Mourinho", "Np"], ["là", "V"], ["huấn luyện viên", "N"], ["của", "E"],
|
26
|
+
["Chelsea", "Np"]]
|
28
27
|
```
|
29
28
|
|
30
29
|
## Contributing
|
data/lib/vn_tagger/document.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'word'
|
2
|
+
|
3
|
+
module VnTagger
|
4
|
+
class Document
|
5
|
+
attr_reader :xml_document
|
6
|
+
|
7
|
+
def initialize(xml_document)
|
8
|
+
@xml_document = xml_document
|
9
|
+
end
|
10
|
+
|
11
|
+
def words
|
12
|
+
@words ||= uncached_words
|
13
|
+
end
|
14
|
+
|
15
|
+
def uncached_words
|
16
|
+
xml_document.xpath('//w').map do |element|
|
17
|
+
Word.new(element.attr('pos'), element.child.text)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def filter_by_tag(tag = 'Proper noun')
|
22
|
+
words.select { |word| word.is_tag?(tag) }
|
23
|
+
end
|
24
|
+
|
25
|
+
def proper_noun_words
|
26
|
+
filter_by_tag('Proper noun').map(&:text)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/vn_tagger/tagger.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
require 'document'
|
2
3
|
|
3
4
|
module VnTagger
|
4
5
|
class Tagger
|
@@ -13,16 +14,28 @@ module VnTagger
|
|
13
14
|
|
14
15
|
def tag
|
15
16
|
write_to_file
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
17
|
+
@success = system("cd #{ROOT_PATH}; #{COMMAND} -i #{INPUT} -o #{OUTPUT}")
|
18
|
+
end
|
19
|
+
|
20
|
+
def xml_result
|
21
|
+
@xml_result ||= if @success
|
22
|
+
file = File.open(OUTPUT)
|
23
|
+
xml_document = Nokogiri::XML(file)
|
24
|
+
file.close
|
25
|
+
xml_document
|
26
|
+
else
|
27
|
+
Nokogiri::XML('')
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def result
|
32
|
+
@result ||= Document.new(xml_result)
|
22
33
|
end
|
23
34
|
|
24
35
|
def self.tag(text)
|
25
|
-
new(text)
|
36
|
+
tagger = new(text)
|
37
|
+
tagger.tag
|
38
|
+
tagger.result
|
26
39
|
end
|
27
40
|
|
28
41
|
private
|
@@ -33,13 +46,6 @@ module VnTagger
|
|
33
46
|
file.close
|
34
47
|
end
|
35
48
|
|
36
|
-
def result_from_output
|
37
|
-
file = File.open(OUTPUT)
|
38
|
-
doc = Nokogiri::XML(file)
|
39
|
-
file.close
|
40
|
-
doc
|
41
|
-
end
|
42
|
-
|
43
49
|
def normalize(string)
|
44
50
|
string.to_s.gsub(/(\"|\')/, '')
|
45
51
|
end
|
data/lib/vn_tagger/version.rb
CHANGED
data/lib/vn_tagger/word.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
module VnTagger
|
2
|
+
class Word
|
3
|
+
attr_reader :position, :text
|
4
|
+
|
5
|
+
WORD_POSITIONS = {
|
6
|
+
'Np' => 'Proper noun',
|
7
|
+
'Nc' => 'Classifier',
|
8
|
+
'Nu' => 'Unit noun',
|
9
|
+
'N' => 'Common noun',
|
10
|
+
'V' => 'Verb',
|
11
|
+
'A' => 'Adjective',
|
12
|
+
'P' => 'Pronoun',
|
13
|
+
'R' => 'Adverb',
|
14
|
+
'L' => 'Determiner',
|
15
|
+
'M' => 'Numeral',
|
16
|
+
'E' => 'Preposition',
|
17
|
+
'C' => 'Subordinating conjunction',
|
18
|
+
'CC' => 'Coordinating conjunction',
|
19
|
+
'I' => 'Interjection',
|
20
|
+
'T' => 'Auxiliary, modal words',
|
21
|
+
'Y' => 'Abbreviation',
|
22
|
+
'Z' => 'Bound morphemes',
|
23
|
+
'X' => 'Unknown'
|
24
|
+
}
|
25
|
+
|
26
|
+
def initialize(position, text)
|
27
|
+
@position = position
|
28
|
+
@text = text
|
29
|
+
end
|
30
|
+
|
31
|
+
def is_tag?(tag = 'Proper noun')
|
32
|
+
WORD_POSITIONS[position] == tag
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/spec/vn_tagger/document_spec.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe VnTagger::Document do
|
4
|
+
let(:xml_text) { '<doc><s><w pos="Np">HLV</w></s></doc>' }
|
5
|
+
let(:xml_document) { Nokogiri::XML(xml_text) }
|
6
|
+
let(:document) { described_class.new(xml_document) }
|
7
|
+
|
8
|
+
describe '#words' do
|
9
|
+
it 'returns data extract from xml document' do
|
10
|
+
word = document.words.first
|
11
|
+
expect(word.text).to eq 'HLV'
|
12
|
+
expect(word.position).to eq 'Np'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe '#proper_noun_words' do
|
17
|
+
subject { document.proper_noun_words }
|
18
|
+
it { is_expected.to eq ['HLV'] }
|
19
|
+
end
|
20
|
+
end
|
data/spec/vn_tagger/tagger_spec.rb
CHANGED
@@ -4,13 +4,14 @@ describe VnTagger::Tagger do
|
|
4
4
|
describe '#tag' do
|
5
5
|
let(:text) { 'HLV cùa Chelsea không hối tiếc vì hành động bỏ về sớm trong trận gặp Aston Villa.' }
|
6
6
|
let(:tagger) { described_class.new(text) }
|
7
|
-
let(:result) { tagger.
|
8
|
-
let(:
|
7
|
+
let(:result) { tagger.result }
|
8
|
+
let(:word) { result.words.first }
|
9
9
|
|
10
10
|
it 'returns xml tagged text' do
|
11
|
-
|
12
|
-
expect(
|
13
|
-
expect(
|
11
|
+
tagger.tag
|
12
|
+
expect(result).to be_a(VnTagger::Document)
|
13
|
+
expect(word.position).to eq 'Np'
|
14
|
+
expect(word.text).to eq 'HLV'
|
14
15
|
end
|
15
16
|
end
|
16
17
|
end
|
data/spec/vn_tagger/word_spec.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe VnTagger::Word do
|
4
|
+
describe '#is_tag?' do
|
5
|
+
let(:word) { described_class.new('N', 'text') }
|
6
|
+
subject { word.is_tag?(tag) }
|
7
|
+
|
8
|
+
context 'when tag is the same with word position' do
|
9
|
+
let(:tag) { 'Common noun' }
|
10
|
+
it { is_expected.to eq true }
|
11
|
+
end
|
12
|
+
|
13
|
+
context 'when tag is not the same with word position' do
|
14
|
+
let(:tag) { 'Verb' }
|
15
|
+
it { is_expected.to eq false }
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vn_tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hieu Nguyen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-12-
|
11
|
+
date: 2014-12-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -86,8 +86,10 @@ files:
|
|
86
86
|
- lib/vn.hus.nlp.tokenizer-4.1.1.jar
|
87
87
|
- lib/vn.hus.nlp.utils-1.0.0.jar
|
88
88
|
- lib/vn_tagger.rb
|
89
|
+
- lib/vn_tagger/document.rb
|
89
90
|
- lib/vn_tagger/tagger.rb
|
90
91
|
- lib/vn_tagger/version.rb
|
92
|
+
- lib/vn_tagger/word.rb
|
91
93
|
- resources/automata/dfaLexicon.xml
|
92
94
|
- resources/automata/externalLexicon.xml
|
93
95
|
- resources/bigram/bigram.xml
|
@@ -98,7 +100,9 @@ files:
|
|
98
100
|
- resources/normalization/rules.txt
|
99
101
|
- resources/prefix/namedEntityPrefix.xml
|
100
102
|
- spec/spec_helper.rb
|
103
|
+
- spec/vn_tagger/document_spec.rb
|
101
104
|
- spec/vn_tagger/tagger_spec.rb
|
105
|
+
- spec/vn_tagger/word_spec.rb
|
102
106
|
- spec/vn_tagger_spec.rb
|
103
107
|
- vn.hus.nlp.tagger-4.2.0.jar
|
104
108
|
- vnTagger.sh
|
@@ -130,5 +134,7 @@ specification_version: 4
|
|
130
134
|
summary: This is a wrapper for vn_tagger library, a A POS tagger for Vietnamese texts.
|
131
135
|
test_files:
|
132
136
|
- spec/spec_helper.rb
|
137
|
+
- spec/vn_tagger/document_spec.rb
|
133
138
|
- spec/vn_tagger/tagger_spec.rb
|
139
|
+
- spec/vn_tagger/word_spec.rb
|
134
140
|
- spec/vn_tagger_spec.rb
|