vn_tagger 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eb1eff57d9490c011b4d340c0d167a20245b8c5c
4
- data.tar.gz: b411874dc1e8f7b574cc8995224f6f091bbb5206
3
+ metadata.gz: 637f23fe16d647e9041131360de3ffd39319c504
4
+ data.tar.gz: ca68ec84d04a2b0689d4f3dff85a9dd8a0a8f874
5
5
  SHA512:
6
- metadata.gz: ddce5c31d971c282ba08599a5b49cd38d46ec5e0c178b5b455b47324c941470ef8b4f15fa0f5c8162460105fcaa39344809100a6e2b6f2a856e5df7e6412716b
7
- data.tar.gz: 7c96f95660818f66a11607d5056546e188e18cdb7a504babe57bb27eb3fcd4102e244e9768982a442b624e3d4a1929f223c35970dd6349d843a4a2f62f67dc3d
6
+ metadata.gz: 2430d567ed7dc55a0cea23ffbd0781dfd147afb7576a3593bc71cbef49779c74b737c5d73f88e67101294a42ad5f629f34f9c8a0974e1d34ce7399faded2a21d
7
+ data.tar.gz: 2caaec9e11213317445b23651cfc64272d35208f6bd256a62a053b01f24750536564b043a9e695672fcf97c2825944e49f3ec83a5b7ddd385e9c5a3c65e30658
data/README.md CHANGED
@@ -20,11 +20,10 @@ Or install it yourself as:
20
20
  text = 'Mourinho là huấn luyện viên của Chelsea'
21
21
  document = VnTagger::Tagger.tag(text)
22
22
 
23
- document.to_xml # => "<?xml version=\"1.0\"?>\n<doc>\n\t<s>\n\t\t<w pos=\"Np\">Mourinho</w>\n\t\t<w pos=\"V\">l&#xE0;</w>\n\t\t<w pos=\"N\">hu&#x1EA5;n luy&#x1EC7;n vi&#xEA;n</w>\n\t\t<w pos=\"E\">c&#x1EE7;a</w>\n\t\t<w pos=\"Np\">Chelsea</w>\n\t\t<w pos=\".\">.</w>\n\t</s>\n</doc>\n"
24
- document.class # => Nokogiri::Xml::Document
25
- keys = document.xpath("//w")
26
- keys.first.attr('pos') # => 'Np'
27
- keys.first.child.text #=> 'Mourinho'
23
+ document.words.map { |word|
24
+ [word.text, word.position]
25
+ } # => [["Mourinho", "Np"], ["là", "V"], ["huấn luyện viên", "N"], ["của", "E"],
26
+ ["Chelsea", "Np"]]
28
27
  ```
29
28
 
30
29
  ## Contributing
@@ -0,0 +1,29 @@
1
+ require 'word'
2
+
3
+ module VnTagger
4
+ class Document
5
+ attr_reader :xml_document
6
+
7
+ def initialize(xml_document)
8
+ @xml_document = xml_document
9
+ end
10
+
11
+ def words
12
+ @words ||= uncached_words
13
+ end
14
+
15
+ def uncached_words
16
+ xml_document.xpath('//w').map do |element|
17
+ Word.new(element.attr('pos'), element.child.text)
18
+ end
19
+ end
20
+
21
+ def filter_by_tag(tag = 'Proper noun')
22
+ words.select { |word| word.is_tag?(tag) }
23
+ end
24
+
25
+ def proper_noun_words
26
+ filter_by_tag('Proper noun').map(&:text)
27
+ end
28
+ end
29
+ end
@@ -1,4 +1,5 @@
1
1
  require 'nokogiri'
2
+ require 'document'
2
3
 
3
4
  module VnTagger
4
5
  class Tagger
@@ -13,16 +14,28 @@ module VnTagger
13
14
 
14
15
  def tag
15
16
  write_to_file
16
- wasGood = system("cd #{ROOT_PATH}; #{COMMAND} -i #{INPUT} -o #{OUTPUT}")
17
- if wasGood
18
- result_from_output
19
- else
20
- nil
21
- end
17
+ @success = system("cd #{ROOT_PATH}; #{COMMAND} -i #{INPUT} -o #{OUTPUT}")
18
+ end
19
+
20
+ def xml_result
21
+ @xml_result ||= if @success
22
+ file = File.open(OUTPUT)
23
+ xml_document = Nokogiri::XML(file)
24
+ file.close
25
+ xml_document
26
+ else
27
+ Nokogiri::XML('')
28
+ end
29
+ end
30
+
31
+ def result
32
+ @result ||= Document.new(xml_result)
22
33
  end
23
34
 
24
35
  def self.tag(text)
25
- new(text).tag
36
+ tagger = new(text)
37
+ tagger.tag
38
+ tagger.result
26
39
  end
27
40
 
28
41
  private
@@ -33,13 +46,6 @@ module VnTagger
33
46
  file.close
34
47
  end
35
48
 
36
- def result_from_output
37
- file = File.open(OUTPUT)
38
- doc = Nokogiri::XML(file)
39
- file.close
40
- doc
41
- end
42
-
43
49
  def normalize(string)
44
50
  string.to_s.gsub(/(\"|\')/, '')
45
51
  end
@@ -1,3 +1,3 @@
1
1
  module VnTagger
2
- VERSION = '0.1.0'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -0,0 +1,35 @@
1
+ module VnTagger
2
+ class Word
3
+ attr_reader :position, :text
4
+
5
+ WORD_POSITIONS = {
6
+ 'Np' => 'Proper noun',
7
+ 'Nc' => 'Classifier',
8
+ 'Nu' => 'Unit noun',
9
+ 'N' => 'Common noun',
10
+ 'V' => 'Verb',
11
+ 'A' => 'Adjective',
12
+ 'P' => 'Pronoun',
13
+ 'R' => 'Adverb',
14
+ 'L' => 'Determiner',
15
+ 'M' => 'Numeral',
16
+ 'E' => 'Preposition',
17
+ 'C' => 'Subordinating conjunction',
18
+ 'CC' => 'Coordinating conjunction',
19
+ 'I' => 'Interjection',
20
+ 'T' => 'Auxiliary, modal words',
21
+ 'Y' => 'Abbreviation',
22
+ 'Z' => 'Bound morphemes',
23
+ 'X' => 'Unknown'
24
+ }
25
+
26
+ def initialize(position, text)
27
+ @position = position
28
+ @text = text
29
+ end
30
+
31
+ def is_tag?(tag = 'Proper noun')
32
+ WORD_POSITIONS[position] == tag
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,20 @@
1
+ require 'spec_helper'
2
+
3
+ describe VnTagger::Document do
4
+ let(:xml_text) { '<doc><s><w pos="Np">HLV</w></s></doc>' }
5
+ let(:xml_document) { Nokogiri::XML(xml_text) }
6
+ let(:document) { described_class.new(xml_document) }
7
+
8
+ describe '#words' do
9
+ it 'returns data extract from xml document' do
10
+ word = document.words.first
11
+ expect(word.text).to eq 'HLV'
12
+ expect(word.position).to eq 'Np'
13
+ end
14
+ end
15
+
16
+ describe '#proper_noun_words' do
17
+ subject { document.proper_noun_words }
18
+ it { is_expected.to eq ['HLV'] }
19
+ end
20
+ end
@@ -4,13 +4,14 @@ describe VnTagger::Tagger do
4
4
  describe '#tag' do
5
5
  let(:text) { 'HLV cùa Chelsea không hối tiếc vì hành động bỏ về sớm trong trận gặp Aston Villa.' }
6
6
  let(:tagger) { described_class.new(text) }
7
- let(:result) { tagger.tag }
8
- let(:key) { result.xpath("//w").first }
7
+ let(:result) { tagger.result }
8
+ let(:word) { result.words.first }
9
9
 
10
10
  it 'returns xml tagged text' do
11
- expect(result).to be_a(Nokogiri::XML::Document)
12
- expect(key.attr('pos')).to eq 'Np'
13
- expect(key.child.text).to eq 'HLV'
11
+ tagger.tag
12
+ expect(result).to be_a(VnTagger::Document)
13
+ expect(word.position).to eq 'Np'
14
+ expect(word.text).to eq 'HLV'
14
15
  end
15
16
  end
16
17
  end
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe VnTagger::Word do
4
+ describe '#is_tag?' do
5
+ let(:word) { described_class.new('N', 'text') }
6
+ subject { word.is_tag?(tag) }
7
+
8
+ context 'when tag is the same with word position' do
9
+ let(:tag) { 'Common noun' }
10
+ it { is_expected.to eq true }
11
+ end
12
+
13
+ context 'when tag is not the same with word position' do
14
+ let(:tag) { 'Verb' }
15
+ it { is_expected.to eq false }
16
+ end
17
+ end
18
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vn_tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hieu Nguyen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-22 00:00:00.000000000 Z
11
+ date: 2014-12-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -86,8 +86,10 @@ files:
86
86
  - lib/vn.hus.nlp.tokenizer-4.1.1.jar
87
87
  - lib/vn.hus.nlp.utils-1.0.0.jar
88
88
  - lib/vn_tagger.rb
89
+ - lib/vn_tagger/document.rb
89
90
  - lib/vn_tagger/tagger.rb
90
91
  - lib/vn_tagger/version.rb
92
+ - lib/vn_tagger/word.rb
91
93
  - resources/automata/dfaLexicon.xml
92
94
  - resources/automata/externalLexicon.xml
93
95
  - resources/bigram/bigram.xml
@@ -98,7 +100,9 @@ files:
98
100
  - resources/normalization/rules.txt
99
101
  - resources/prefix/namedEntityPrefix.xml
100
102
  - spec/spec_helper.rb
103
+ - spec/vn_tagger/document_spec.rb
101
104
  - spec/vn_tagger/tagger_spec.rb
105
+ - spec/vn_tagger/word_spec.rb
102
106
  - spec/vn_tagger_spec.rb
103
107
  - vn.hus.nlp.tagger-4.2.0.jar
104
108
  - vnTagger.sh
@@ -130,5 +134,7 @@ specification_version: 4
130
134
  summary: This is a wrapper for vn_tagger library, a A POS tagger for Vietnamese texts.
131
135
  test_files:
132
136
  - spec/spec_helper.rb
137
+ - spec/vn_tagger/document_spec.rb
133
138
  - spec/vn_tagger/tagger_spec.rb
139
+ - spec/vn_tagger/word_spec.rb
134
140
  - spec/vn_tagger_spec.rb