vn_tagger 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eb1eff57d9490c011b4d340c0d167a20245b8c5c
4
- data.tar.gz: b411874dc1e8f7b574cc8995224f6f091bbb5206
3
+ metadata.gz: 637f23fe16d647e9041131360de3ffd39319c504
4
+ data.tar.gz: ca68ec84d04a2b0689d4f3dff85a9dd8a0a8f874
5
5
  SHA512:
6
- metadata.gz: ddce5c31d971c282ba08599a5b49cd38d46ec5e0c178b5b455b47324c941470ef8b4f15fa0f5c8162460105fcaa39344809100a6e2b6f2a856e5df7e6412716b
7
- data.tar.gz: 7c96f95660818f66a11607d5056546e188e18cdb7a504babe57bb27eb3fcd4102e244e9768982a442b624e3d4a1929f223c35970dd6349d843a4a2f62f67dc3d
6
+ metadata.gz: 2430d567ed7dc55a0cea23ffbd0781dfd147afb7576a3593bc71cbef49779c74b737c5d73f88e67101294a42ad5f629f34f9c8a0974e1d34ce7399faded2a21d
7
+ data.tar.gz: 2caaec9e11213317445b23651cfc64272d35208f6bd256a62a053b01f24750536564b043a9e695672fcf97c2825944e49f3ec83a5b7ddd385e9c5a3c65e30658
data/README.md CHANGED
@@ -20,11 +20,10 @@ Or install it yourself as:
20
20
  text = 'Mourinho là huấn luyện viên của Chelsea'
21
21
  document = VnTagger::Tagger.tag(text)
22
22
 
23
- document.to_xml # => "<?xml version=\"1.0\"?>\n<doc>\n\t<s>\n\t\t<w pos=\"Np\">Mourinho</w>\n\t\t<w pos=\"V\">l&#xE0;</w>\n\t\t<w pos=\"N\">hu&#x1EA5;n luy&#x1EC7;n vi&#xEA;n</w>\n\t\t<w pos=\"E\">c&#x1EE7;a</w>\n\t\t<w pos=\"Np\">Chelsea</w>\n\t\t<w pos=\".\">.</w>\n\t</s>\n</doc>\n"
24
- document.class # => Nokogiri::Xml::Document
25
- keys = document.xpath("//w")
26
- keys.first.attr('pos') # => 'Np'
27
- keys.first.child.text #=> 'Mourinho'
23
+ document.words.map { |word|
24
+ [word.text, word.position]
25
+ } # => [["Mourinho", "Np"], ["là", "V"], ["huấn luyện viên", "N"], ["của", "E"],
26
+ ["Chelsea", "Np"]]
28
27
  ```
29
28
 
30
29
  ## Contributing
@@ -0,0 +1,29 @@
1
+ require 'word'
2
+
3
+ module VnTagger
4
+ class Document
5
+ attr_reader :xml_document
6
+
7
+ def initialize(xml_document)
8
+ @xml_document = xml_document
9
+ end
10
+
11
+ def words
12
+ @words ||= uncached_words
13
+ end
14
+
15
+ def uncached_words
16
+ xml_document.xpath('//w').map do |element|
17
+ Word.new(element.attr('pos'), element.child.text)
18
+ end
19
+ end
20
+
21
+ def filter_by_tag(tag = 'Proper noun')
22
+ words.select { |word| word.is_tag?(tag) }
23
+ end
24
+
25
+ def proper_noun_words
26
+ filter_by_tag('Proper noun').map(&:text)
27
+ end
28
+ end
29
+ end
@@ -1,4 +1,5 @@
1
1
  require 'nokogiri'
2
+ require 'document'
2
3
 
3
4
  module VnTagger
4
5
  class Tagger
@@ -13,16 +14,28 @@ module VnTagger
13
14
 
14
15
  def tag
15
16
  write_to_file
16
- wasGood = system("cd #{ROOT_PATH}; #{COMMAND} -i #{INPUT} -o #{OUTPUT}")
17
- if wasGood
18
- result_from_output
19
- else
20
- nil
21
- end
17
+ @success = system("cd #{ROOT_PATH}; #{COMMAND} -i #{INPUT} -o #{OUTPUT}")
18
+ end
19
+
20
+ def xml_result
21
+ @xml_result ||= if @success
22
+ file = File.open(OUTPUT)
23
+ xml_document = Nokogiri::XML(file)
24
+ file.close
25
+ xml_document
26
+ else
27
+ Nokogiri::XML('')
28
+ end
29
+ end
30
+
31
+ def result
32
+ @result ||= Document.new(xml_result)
22
33
  end
23
34
 
24
35
  def self.tag(text)
25
- new(text).tag
36
+ tagger = new(text)
37
+ tagger.tag
38
+ tagger.result
26
39
  end
27
40
 
28
41
  private
@@ -33,13 +46,6 @@ module VnTagger
33
46
  file.close
34
47
  end
35
48
 
36
- def result_from_output
37
- file = File.open(OUTPUT)
38
- doc = Nokogiri::XML(file)
39
- file.close
40
- doc
41
- end
42
-
43
49
  def normalize(string)
44
50
  string.to_s.gsub(/(\"|\')/, '')
45
51
  end
@@ -1,3 +1,3 @@
1
1
  module VnTagger
2
- VERSION = '0.1.0'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -0,0 +1,35 @@
1
+ module VnTagger
2
+ class Word
3
+ attr_reader :position, :text
4
+
5
+ WORD_POSITIONS = {
6
+ 'Np' => 'Proper noun',
7
+ 'Nc' => 'Classifier',
8
+ 'Nu' => 'Unit noun',
9
+ 'N' => 'Common noun',
10
+ 'V' => 'Verb',
11
+ 'A' => 'Adjective',
12
+ 'P' => 'Pronoun',
13
+ 'R' => 'Adverb',
14
+ 'L' => 'Determiner',
15
+ 'M' => 'Numeral',
16
+ 'E' => 'Preposition',
17
+ 'C' => 'Subordinating conjunction',
18
+ 'CC' => 'Coordinating conjunction',
19
+ 'I' => 'Interjection',
20
+ 'T' => 'Auxiliary, modal words',
21
+ 'Y' => 'Abbreviation',
22
+ 'Z' => 'Bound morphemes',
23
+ 'X' => 'Unknown'
24
+ }
25
+
26
+ def initialize(position, text)
27
+ @position = position
28
+ @text = text
29
+ end
30
+
31
+ def is_tag?(tag = 'Proper noun')
32
+ WORD_POSITIONS[position] == tag
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,20 @@
1
+ require 'spec_helper'
2
+
3
+ describe VnTagger::Document do
4
+ let(:xml_text) { '<doc><s><w pos="Np">HLV</w></s></doc>' }
5
+ let(:xml_document) { Nokogiri::XML(xml_text) }
6
+ let(:document) { described_class.new(xml_document) }
7
+
8
+ describe '#words' do
9
+ it 'returns data extract from xml document' do
10
+ word = document.words.first
11
+ expect(word.text).to eq 'HLV'
12
+ expect(word.position).to eq 'Np'
13
+ end
14
+ end
15
+
16
+ describe '#proper_noun_words' do
17
+ subject { document.proper_noun_words }
18
+ it { is_expected.to eq ['HLV'] }
19
+ end
20
+ end
@@ -4,13 +4,14 @@ describe VnTagger::Tagger do
4
4
  describe '#tag' do
5
5
  let(:text) { 'HLV cùa Chelsea không hối tiếc vì hành động bỏ về sớm trong trận gặp Aston Villa.' }
6
6
  let(:tagger) { described_class.new(text) }
7
- let(:result) { tagger.tag }
8
- let(:key) { result.xpath("//w").first }
7
+ let(:result) { tagger.result }
8
+ let(:word) { result.words.first }
9
9
 
10
10
  it 'returns xml tagged text' do
11
- expect(result).to be_a(Nokogiri::XML::Document)
12
- expect(key.attr('pos')).to eq 'Np'
13
- expect(key.child.text).to eq 'HLV'
11
+ tagger.tag
12
+ expect(result).to be_a(VnTagger::Document)
13
+ expect(word.position).to eq 'Np'
14
+ expect(word.text).to eq 'HLV'
14
15
  end
15
16
  end
16
17
  end
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe VnTagger::Word do
4
+ describe '#is_tag?' do
5
+ let(:word) { described_class.new('N', 'text') }
6
+ subject { word.is_tag?(tag) }
7
+
8
+ context 'when tag is the same with word position' do
9
+ let(:tag) { 'Common noun' }
10
+ it { is_expected.to eq true }
11
+ end
12
+
13
+ context 'when tag is not the same with word position' do
14
+ let(:tag) { 'Verb' }
15
+ it { is_expected.to eq false }
16
+ end
17
+ end
18
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vn_tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hieu Nguyen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-22 00:00:00.000000000 Z
11
+ date: 2014-12-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -86,8 +86,10 @@ files:
86
86
  - lib/vn.hus.nlp.tokenizer-4.1.1.jar
87
87
  - lib/vn.hus.nlp.utils-1.0.0.jar
88
88
  - lib/vn_tagger.rb
89
+ - lib/vn_tagger/document.rb
89
90
  - lib/vn_tagger/tagger.rb
90
91
  - lib/vn_tagger/version.rb
92
+ - lib/vn_tagger/word.rb
91
93
  - resources/automata/dfaLexicon.xml
92
94
  - resources/automata/externalLexicon.xml
93
95
  - resources/bigram/bigram.xml
@@ -98,7 +100,9 @@ files:
98
100
  - resources/normalization/rules.txt
99
101
  - resources/prefix/namedEntityPrefix.xml
100
102
  - spec/spec_helper.rb
103
+ - spec/vn_tagger/document_spec.rb
101
104
  - spec/vn_tagger/tagger_spec.rb
105
+ - spec/vn_tagger/word_spec.rb
102
106
  - spec/vn_tagger_spec.rb
103
107
  - vn.hus.nlp.tagger-4.2.0.jar
104
108
  - vnTagger.sh
@@ -130,5 +134,7 @@ specification_version: 4
130
134
  summary: This is a wrapper for vn_tagger library, a A POS tagger for Vietnamese texts.
131
135
  test_files:
132
136
  - spec/spec_helper.rb
137
+ - spec/vn_tagger/document_spec.rb
133
138
  - spec/vn_tagger/tagger_spec.rb
139
+ - spec/vn_tagger/word_spec.rb
134
140
  - spec/vn_tagger_spec.rb