vn_tagger 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -5
- data/lib/vn_tagger/document.rb +29 -0
- data/lib/vn_tagger/tagger.rb +20 -14
- data/lib/vn_tagger/version.rb +1 -1
- data/lib/vn_tagger/word.rb +35 -0
- data/spec/vn_tagger/document_spec.rb +20 -0
- data/spec/vn_tagger/tagger_spec.rb +6 -5
- data/spec/vn_tagger/word_spec.rb +18 -0
- metadata +8 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 637f23fe16d647e9041131360de3ffd39319c504
         | 
| 4 | 
            +
              data.tar.gz: ca68ec84d04a2b0689d4f3dff85a9dd8a0a8f874
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 2430d567ed7dc55a0cea23ffbd0781dfd147afb7576a3593bc71cbef49779c74b737c5d73f88e67101294a42ad5f629f34f9c8a0974e1d34ce7399faded2a21d
         | 
| 7 | 
            +
              data.tar.gz: 2caaec9e11213317445b23651cfc64272d35208f6bd256a62a053b01f24750536564b043a9e695672fcf97c2825944e49f3ec83a5b7ddd385e9c5a3c65e30658
         | 
    
        data/README.md
    CHANGED
    
    | @@ -20,11 +20,10 @@ Or install it yourself as: | |
| 20 20 | 
             
            text = 'Mourinho là huấn luyện viên của Chelsea'
         | 
| 21 21 | 
             
            document = VnTagger::Tagger.tag(text)
         | 
| 22 22 |  | 
| 23 | 
            -
            document. | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
            keys.first.child.text #=> 'Mourinho'
         | 
| 23 | 
            +
            document.words.map { |word|
         | 
| 24 | 
            +
              [word.text, word.position]
         | 
| 25 | 
            +
            } # => [["Mourinho", "Np"], ["là", "V"], ["huấn luyện viên", "N"], ["của", "E"],
         | 
| 26 | 
            +
            ["Chelsea", "Np"]]
         | 
| 28 27 | 
             
            ```
         | 
| 29 28 |  | 
| 30 29 | 
             
            ## Contributing
         | 
    
        data/lib/vn_tagger/document.rb
    ADDED
    
    | @@ -0,0 +1,29 @@ | |
| 1 | 
            +
            require 'word'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module VnTagger
         | 
| 4 | 
            +
              class Document
         | 
| 5 | 
            +
                attr_reader :xml_document
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                def initialize(xml_document)
         | 
| 8 | 
            +
                  @xml_document = xml_document
         | 
| 9 | 
            +
                end
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                def words
         | 
| 12 | 
            +
                  @words ||= uncached_words
         | 
| 13 | 
            +
                end
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                def uncached_words
         | 
| 16 | 
            +
                  xml_document.xpath('//w').map do |element|
         | 
| 17 | 
            +
                    Word.new(element.attr('pos'), element.child.text)
         | 
| 18 | 
            +
                  end
         | 
| 19 | 
            +
                end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                def filter_by_tag(tag = 'Proper noun')
         | 
| 22 | 
            +
                  words.select { |word| word.is_tag?(tag) }
         | 
| 23 | 
            +
                end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                def proper_noun_words
         | 
| 26 | 
            +
                  filter_by_tag('Proper noun').map(&:text)
         | 
| 27 | 
            +
                end
         | 
| 28 | 
            +
              end
         | 
| 29 | 
            +
            end
         | 
    
        data/lib/vn_tagger/tagger.rb
    CHANGED
    
    | @@ -1,4 +1,5 @@ | |
| 1 1 | 
             
            require 'nokogiri'
         | 
| 2 | 
            +
            require 'document'
         | 
| 2 3 |  | 
| 3 4 | 
             
            module VnTagger
         | 
| 4 5 | 
             
              class Tagger
         | 
| @@ -13,16 +14,28 @@ module VnTagger | |
| 13 14 |  | 
| 14 15 | 
             
                def tag
         | 
| 15 16 | 
             
                  write_to_file
         | 
| 16 | 
            -
                   | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 17 | 
            +
                  @success = system("cd #{ROOT_PATH}; #{COMMAND} -i #{INPUT} -o #{OUTPUT}")
         | 
| 18 | 
            +
                end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                def xml_result
         | 
| 21 | 
            +
                  @xml_result ||= if @success
         | 
| 22 | 
            +
                                    file = File.open(OUTPUT)
         | 
| 23 | 
            +
                                    xml_document = Nokogiri::XML(file)
         | 
| 24 | 
            +
                                    file.close
         | 
| 25 | 
            +
                                    xml_document
         | 
| 26 | 
            +
                                  else
         | 
| 27 | 
            +
                                    Nokogiri::XML('')
         | 
| 28 | 
            +
                                  end
         | 
| 29 | 
            +
                end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                def result
         | 
| 32 | 
            +
                  @result ||= Document.new(xml_result)
         | 
| 22 33 | 
             
                end
         | 
| 23 34 |  | 
| 24 35 | 
             
                def self.tag(text)
         | 
| 25 | 
            -
                  new(text) | 
| 36 | 
            +
                  tagger = new(text)
         | 
| 37 | 
            +
                  tagger.tag
         | 
| 38 | 
            +
                  tagger.result
         | 
| 26 39 | 
             
                end
         | 
| 27 40 |  | 
| 28 41 | 
             
                private
         | 
| @@ -33,13 +46,6 @@ module VnTagger | |
| 33 46 | 
             
                  file.close
         | 
| 34 47 | 
             
                end
         | 
| 35 48 |  | 
| 36 | 
            -
                def result_from_output
         | 
| 37 | 
            -
                  file = File.open(OUTPUT)
         | 
| 38 | 
            -
                  doc = Nokogiri::XML(file)
         | 
| 39 | 
            -
                  file.close
         | 
| 40 | 
            -
                  doc
         | 
| 41 | 
            -
                end
         | 
| 42 | 
            -
             | 
| 43 49 | 
             
                def normalize(string)
         | 
| 44 50 | 
             
                  string.to_s.gsub(/(\"|\')/, '')
         | 
| 45 51 | 
             
                end
         | 
    
        data/lib/vn_tagger/version.rb
    CHANGED
    
    
    
        data/lib/vn_tagger/word.rb
    ADDED
    
    | @@ -0,0 +1,35 @@ | |
| 1 | 
            +
            module VnTagger
         | 
| 2 | 
            +
              class Word
         | 
| 3 | 
            +
                attr_reader :position, :text
         | 
| 4 | 
            +
             | 
| 5 | 
            +
                WORD_POSITIONS = {
         | 
| 6 | 
            +
                  'Np' => 'Proper noun',
         | 
| 7 | 
            +
                  'Nc' => 'Classifier',
         | 
| 8 | 
            +
                  'Nu' => 'Unit noun',
         | 
| 9 | 
            +
                  'N' => 'Common noun',
         | 
| 10 | 
            +
                  'V' => 'Verb',
         | 
| 11 | 
            +
                  'A' => 'Adjective',
         | 
| 12 | 
            +
                  'P' => 'Pronoun',
         | 
| 13 | 
            +
                  'R' => 'Adverb',
         | 
| 14 | 
            +
                  'L' => 'Determiner',
         | 
| 15 | 
            +
                  'M' => 'Numeral',
         | 
| 16 | 
            +
                  'E' => 'Preposition',
         | 
| 17 | 
            +
                  'C' => 'Subordinating conjunction',
         | 
| 18 | 
            +
                  'CC' => 'Coordinating conjunction',
         | 
| 19 | 
            +
                  'I' => 'Interjection',
         | 
| 20 | 
            +
                  'T' => 'Auxiliary, modal words',
         | 
| 21 | 
            +
                  'Y' => 'Abbreviation',
         | 
| 22 | 
            +
                  'Z' => 'Bound morphemes',
         | 
| 23 | 
            +
                  'X' => 'Unknown'
         | 
| 24 | 
            +
                }
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                def initialize(position, text)
         | 
| 27 | 
            +
                  @position = position
         | 
| 28 | 
            +
                  @text = text
         | 
| 29 | 
            +
                end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                def is_tag?(tag = 'Proper noun')
         | 
| 32 | 
            +
                  WORD_POSITIONS[position] == tag
         | 
| 33 | 
            +
                end
         | 
| 34 | 
            +
              end
         | 
| 35 | 
            +
            end
         | 
    
        data/spec/vn_tagger/document_spec.rb
    ADDED
    
    | @@ -0,0 +1,20 @@ | |
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            describe VnTagger::Document do
         | 
| 4 | 
            +
              let(:xml_text) { '<doc><s><w pos="Np">HLV</w></s></doc>' }
         | 
| 5 | 
            +
              let(:xml_document) { Nokogiri::XML(xml_text) }
         | 
| 6 | 
            +
              let(:document) { described_class.new(xml_document) }
         | 
| 7 | 
            +
             | 
| 8 | 
            +
              describe '#words' do
         | 
| 9 | 
            +
                it 'returns data extract from xml document' do
         | 
| 10 | 
            +
                  word = document.words.first
         | 
| 11 | 
            +
                  expect(word.text).to eq 'HLV'
         | 
| 12 | 
            +
                  expect(word.position).to eq 'Np'
         | 
| 13 | 
            +
                end
         | 
| 14 | 
            +
              end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
              describe '#proper_noun_words' do
         | 
| 17 | 
            +
                subject { document.proper_noun_words }
         | 
| 18 | 
            +
                it { is_expected.to eq ['HLV'] }
         | 
| 19 | 
            +
              end
         | 
| 20 | 
            +
            end
         | 
    
        data/spec/vn_tagger/tagger_spec.rb
    CHANGED
    
    | @@ -4,13 +4,14 @@ describe VnTagger::Tagger do | |
| 4 4 | 
             
              describe '#tag' do
         | 
| 5 5 | 
             
                let(:text) { 'HLV cùa Chelsea không hối tiếc vì hành động bỏ về sớm trong trận gặp Aston Villa.' }
         | 
| 6 6 | 
             
                let(:tagger) { described_class.new(text) }
         | 
| 7 | 
            -
                let(:result) { tagger. | 
| 8 | 
            -
                let(: | 
| 7 | 
            +
                let(:result) { tagger.result }
         | 
| 8 | 
            +
                let(:word) { result.words.first }
         | 
| 9 9 |  | 
| 10 10 | 
             
                it 'returns xml tagged text' do
         | 
| 11 | 
            -
                   | 
| 12 | 
            -
                  expect( | 
| 13 | 
            -
                  expect( | 
| 11 | 
            +
                  tagger.tag
         | 
| 12 | 
            +
                  expect(result).to be_a(VnTagger::Document)
         | 
| 13 | 
            +
                  expect(word.position).to eq 'Np'
         | 
| 14 | 
            +
                  expect(word.text).to eq 'HLV'
         | 
| 14 15 | 
             
                end
         | 
| 15 16 | 
             
              end
         | 
| 16 17 | 
             
            end
         | 
    
        data/spec/vn_tagger/word_spec.rb
    ADDED
    
    | @@ -0,0 +1,18 @@ | |
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            describe VnTagger::Word do
         | 
| 4 | 
            +
              describe '#is_tag?' do
         | 
| 5 | 
            +
                let(:word) { described_class.new('N', 'text') }
         | 
| 6 | 
            +
                subject { word.is_tag?(tag) }
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                context 'when tag is the same with word position' do
         | 
| 9 | 
            +
                  let(:tag) { 'Common noun' }
         | 
| 10 | 
            +
                  it { is_expected.to eq true }
         | 
| 11 | 
            +
                end
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                context 'when tag is not the same with word position' do
         | 
| 14 | 
            +
                  let(:tag) { 'Verb' }
         | 
| 15 | 
            +
                  it { is_expected.to eq false }
         | 
| 16 | 
            +
                end
         | 
| 17 | 
            +
              end
         | 
| 18 | 
            +
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: vn_tagger
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.2.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Hieu Nguyen
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2014-12- | 
| 11 | 
            +
            date: 2014-12-30 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: bundler
         | 
| @@ -86,8 +86,10 @@ files: | |
| 86 86 | 
             
            - lib/vn.hus.nlp.tokenizer-4.1.1.jar
         | 
| 87 87 | 
             
            - lib/vn.hus.nlp.utils-1.0.0.jar
         | 
| 88 88 | 
             
            - lib/vn_tagger.rb
         | 
| 89 | 
            +
            - lib/vn_tagger/document.rb
         | 
| 89 90 | 
             
            - lib/vn_tagger/tagger.rb
         | 
| 90 91 | 
             
            - lib/vn_tagger/version.rb
         | 
| 92 | 
            +
            - lib/vn_tagger/word.rb
         | 
| 91 93 | 
             
            - resources/automata/dfaLexicon.xml
         | 
| 92 94 | 
             
            - resources/automata/externalLexicon.xml
         | 
| 93 95 | 
             
            - resources/bigram/bigram.xml
         | 
| @@ -98,7 +100,9 @@ files: | |
| 98 100 | 
             
            - resources/normalization/rules.txt
         | 
| 99 101 | 
             
            - resources/prefix/namedEntityPrefix.xml
         | 
| 100 102 | 
             
            - spec/spec_helper.rb
         | 
| 103 | 
            +
            - spec/vn_tagger/document_spec.rb
         | 
| 101 104 | 
             
            - spec/vn_tagger/tagger_spec.rb
         | 
| 105 | 
            +
            - spec/vn_tagger/word_spec.rb
         | 
| 102 106 | 
             
            - spec/vn_tagger_spec.rb
         | 
| 103 107 | 
             
            - vn.hus.nlp.tagger-4.2.0.jar
         | 
| 104 108 | 
             
            - vnTagger.sh
         | 
| @@ -130,5 +134,7 @@ specification_version: 4 | |
| 130 134 | 
             
            summary: This is a wrapper for vn_tagger library, a A POS tagger for Vietnamese texts.
         | 
| 131 135 | 
             
            test_files:
         | 
| 132 136 | 
             
            - spec/spec_helper.rb
         | 
| 137 | 
            +
            - spec/vn_tagger/document_spec.rb
         | 
| 133 138 | 
             
            - spec/vn_tagger/tagger_spec.rb
         | 
| 139 | 
            +
            - spec/vn_tagger/word_spec.rb
         | 
| 134 140 | 
             
            - spec/vn_tagger_spec.rb
         |