vn_tagger 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ <?xml version="1.0" encoding="utf-8" standalone="yes"?>
2
+ <!-- (C) phuonglh@gmail.com -->
3
+ <corpus id="resources/lexers/lexers.xml">
4
+ <body>
5
+ <w msd="numbersign">#</w>
6
+ <w msd="ampersand">&amp;</w>
7
+ <w msd="date_mm-dd-yy">(0*[1-9]|1[012])-(0*[1-9]|[12][0-9]|3[01])-\d\d</w>
8
+ <w msd="date_mm/dd/yy">(0*[1-9]|1[012])/(0*[1-9]|[12][0-9]|3[01])/\d\d</w>
9
+ <w msd="date_mm.dd.yy">(0*[1-9]|1[012])\.(0*[1-9]|[12][0-9]|3[01])\.\d\d</w>
10
+ <w msd="date_dd-mm-yy">([12][0-9]|3[01]|0*[1-9])-(1[012]|0*[1-9])-\d\d</w>
11
+ <w msd="date_dd/mm/yy">([12][0-9]|3[01]|0*[1-9])/(1[012]|0*[1-9])/\d\d</w>
12
+ <w msd="date_dd.mm.yy">([12][0-9]|3[01]|0*[1-9])[\.](1[012]|0*[1-9])[\.]\d\d</w>
13
+ <w msd="date_dd-mm-yyyy">([12][0-9]|3[01]|0*[1-9])-(1[012]|0*[1-9])-(19|20)\d\d</w>
14
+ <w msd="date_dd/mm/yyyy">([12][0-9]|3[01]|0*[1-9])/(1[012]|0*[1-9])/(19|20)\d\d</w>
15
+ <w msd="date_dd.mm.yyyy">([12][0-9]|3[01]|0*[1-9])\.(1[012]|0*[1-9])\.(19|20)\d\d</w>
16
+ <w msd="date_dd-mm">(0*[1-9]|[12][0-9]|3[01])[-/\.](1[012]|0*[1-9])</w>
17
+ <w msd="date_mm-yy">(0*[1-9]|1[012])[-/\.]\d\d</w>
18
+ <w msd="date_mm-yyyy">(0*[1-9]|1[012])[-/\.](19|20)\d\d</w>
19
+ <w msd="date_yyyy">(19|20)\d\d</w>
20
+ <w msd="date_mm-dd-yyyy">(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])-(19|20)\d\d</w>
21
+ <w msd="date_mm/dd/yyyy">(0*[1-9]|1[012])/([12][0-9]|3[01]|0*[1-9])/(19|20)\d\d</w>
22
+ <w msd="date_mm.dd.yyyy">(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])\.(19|20)\d\d</w>
23
+ <w msd="date_yyyy-mm-dd">(19|20)\d\d-(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])</w>
24
+ <w msd="date_yyyy/mm/dd">(19|20)\d\d/(0*[1-9]|1[012])/([12][0-9]|3[01]|0*[1-9])</w>
25
+ <w msd="date_yyyy.mm.dd">(19|20)\d\d\.(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])</w>
26
+ <w msd="hhmmss">([0-1]\d|[2][0-3]):[0-5]\d:[0-5]\d</w>
27
+ <w msd="percent">([0-9]*[\.,])?[0-9]+%</w>
28
+ <w msd="name1">[A-ZÁÂĐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*</w>
29
+ <w msd="name2">([A-ZÁÂĐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*)(\s+[A-ZÁÂĐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*</w>
30
+ <w msd="phrase">([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ])?([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\s])*([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz])+$*</w>
31
+ <w msd="allcaps">([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)(\s*[AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)*[^aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz/\)\(\?!\.;:,\-"']</w>
32
+ <w msd="fraction">(\d+)/(\d+)</w>
33
+ <w msd="email">(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})</w>
34
+ <w msd="return">(^$)</w>
35
+ <w msd="fslash">/</w>
36
+ <w msd="langle">&lt;</w>
37
+ <w msd="xmltags">&lt;/*\w*&gt;</w>
38
+ <w msd="equal">=</w>
39
+ <w msd="rangle">&gt;</w>
40
+ <w msd="aroba">@</w>
41
+ <w msd="number1">[+]?([0-9]*)?[0-9]+([\.,]\d+)*</w>
42
+ <!--
43
+ <w msd="number2">[+]?([0-9]*)?[0-9]+([\.,]\d+)*(\s|tỉ|tỷ|triệu|ngàn|nghìn|trăm|chục)*</w>
44
+ -->
45
+ <w msd="degree">[-+]?([0-9]*[\.,])?[0-9]+°</w>
46
+ <w msd="ponctuation">[\\?!\\.:;,\-"']</w>
47
+ <w msd="dollar">\$</w>
48
+ <w msd="lparen">\(</w>
49
+ <w msd="rparen">\)</w>
50
+ <w msd="asterisk">\*</w>
51
+ <w msd="plus">\+</w>
52
+ <w msd="minus">\-</w>
53
+ <w msd="ellipsis">\.\.\.</w>
54
+ <w msd="residual">\W</w>
55
+ <w msd="lbracket">\[</w>
56
+ <w msd="bslash">\\</w>
57
+ <w msd="rbracket">\]</w>
58
+ <w msd="entity0">\d+([\.,]\d+)*[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+\d+$</w>
59
+ <w msd="entity1">[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+(\d)*$</w>
60
+ <w msd="entity2">[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\d]+([\.\-/][\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*[\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+</w>
61
+ <w msd="space">\s+</w>
62
+ <w msd="word">\w</w>
63
+ <w msd="lcbrace">\{</w>
64
+ <w msd="rcbrace">\}</w>
65
+ <w msd="underscore">_</w>
66
+ <w msd="pound">£</w>
67
+ </body>
68
+ </corpus>
Binary file
@@ -0,0 +1,36 @@
1
+ ## tagger training invoked at Thu Aug 05 14:33:50 CEST 2010 with arguments:
2
+ model = resources/models/vtb.tagger
3
+ arch = left5words,vietnameseunknowns
4
+ trainFile = data/vtb-20091030.tagged.txt
5
+ closedClassTags =
6
+ closedClassTagThreshold = 40
7
+ curWordMinFeatureThresh = 2
8
+ debug = true
9
+ debugPrefix =
10
+ tagSeparator = /
11
+ encoding = UTF-8
12
+ initFromTrees = false
13
+ iterations = 100
14
+ lang = vietnamese
15
+ learnClosedClassTags = false
16
+ minFeatureThresh = 2
17
+ openClassTags =
18
+ rareWordMinFeatureThresh = 10
19
+ rareWordThresh = 6
20
+ search = qn
21
+ sgml = false
22
+ sigmaSquared = 0.5
23
+ regL1 = 1.0
24
+ tagInside =
25
+ tokenize = false
26
+ tokenizerFactory =
27
+ tokenizerOptions =
28
+ treeRange =
29
+ treeNormalizer =
30
+ treeTransformer =
31
+ verbose = false
32
+ veryCommonWordThresh = 50
33
+ xmlInput =
34
+ outputFile =
35
+ outputFormat = slashTags
36
+ outputFormatOptions =
@@ -0,0 +1,15 @@
1
+ òa oà
2
+ óa oá
3
+ ỏa oả
4
+ õa oã
5
+ ọa oạ
6
+ òe oè
7
+ óe oé
8
+ ỏe oẻ
9
+ õe oẽ
10
+ ọe oẹ
11
+ ùy uỳ
12
+ úy uý
13
+ ủy uỷ
14
+ ũy uỹ
15
+ ụy uỵ
@@ -0,0 +1,34 @@
1
+ <?xml version="1.0" encoding="utf-8" standalone="yes"?>
2
+ <corpus id="resources/prefix/namedEntityPrefix.xml">
3
+ <body>
4
+ <!-- classifier prefix -->
5
+ <w>ông</w>
6
+ <w>bà</w>
7
+ <w>bác</w>
8
+ <w>chú</w>
9
+ <w>cô</w>
10
+ <w>thím</w>
11
+ <w>dì</w>
12
+ <w>cậu</w>
13
+ <w>cụ</w>
14
+ <w>mợ</w>
15
+ <w>ngài</w>
16
+ <w>anh</w>
17
+ <w>chị</w>
18
+ <w>thằng</w>
19
+ <w>cái</w>
20
+ <w>vua</w>
21
+ <!-- organization prefix -->
22
+ <w>tỉnh</w>
23
+ <w>huyện</w>
24
+ <w>xã</w>
25
+ <w>phường</w>
26
+ <w>bộ</w>
27
+ <w>sở</w>
28
+ <w>cục</w>
29
+ <w>ban</w>
30
+ <w>ngành</w>
31
+ <!-- Other prefix -->
32
+ <w>theo</w>
33
+ </body>
34
+ </corpus>
@@ -0,0 +1,9 @@
1
+ require 'byebug'
2
+
3
+ spec_folder = File.dirname(__FILE__)
4
+ root_folder = File.dirname(spec_folder)
5
+ app_folder = File.join(root_folder, 'lib', '*.rb')
6
+ Dir[app_folder].each { |file| require file }
7
+
8
+ RSpec.configure do |config|
9
+ end
@@ -0,0 +1,14 @@
1
+ require 'spec_helper'
2
+ require 'byebug'
3
+
4
+ describe VnTagger::Tagger do
5
+ describe '#tag' do
6
+ let(:text) { 'HLV của Chelsea không hối tiếc vì hành động bỏ về sớm trong trận gặp Aston Villa.' }
7
+ let(:tagger) { described_class.new(text) }
8
+ let(:result) { tagger.tag }
9
+
10
+ it 'returns xml tagged text' do
11
+ expect(result).to be_a(Nokogiri::XML::Document)
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+ describe VnTagger do
4
+ describe '#tokenize' do
5
+ end
6
+ end
Binary file
data/vnTagger.sh ADDED
@@ -0,0 +1,28 @@
1
+ #!/bin/sh
2
+
3
+ # The main program to run, located next to this script ($0 is POSIX; BASH_SOURCE is bash-only and empty under /bin/sh)
4
+ PROGRAM="$(dirname "$0")/vn.hus.nlp.tagger-4.2.0.jar"
5
+
6
+ # Get the java command
7
+ #
8
+ if [ -z "$JAVACMD" ] ; then
9
+ if [ -n "$JAVA_HOME" ] ; then
10
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
11
+ JAVACMD="$JAVA_HOME/jre/sh/java"
12
+ else
13
+ JAVACMD="$JAVA_HOME/bin/java"
14
+ fi
15
+ else
16
+ JAVACMD=`which java 2> /dev/null`
17
+ if [ -z "$JAVACMD" ] ; then
18
+ JAVACMD=java
19
+ fi
20
+ fi
21
+ fi
22
+
23
+ # Run the programme; every expansion is quoted so paths and arguments containing spaces are not word-split
24
+ #
25
+ "$JAVACMD" -mx500m -jar "$PROGRAM" "$@"
26
+
27
+
28
+
data/vn_tagger.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'vn_tagger/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "vn_tagger"
8
+ spec.version = VnTagger::VERSION
9
+ spec.authors = ["Hieu Nguyen"]
10
+ spec.email = ["hieuk09@gmail.com"]
11
+ spec.summary = %q{This is a wrapper for the vn_tagger library, a POS tagger for Vietnamese texts.}
12
+ spec.description = %q{This is a wrapper for the vn_tagger library, a POS tagger for Vietnamese texts.}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib", 'lib/vn_tagger']
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.6"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "rspec"
24
+ spec.add_development_dependency "byebug"
25
+ end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vn_tagger
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Hieu Nguyen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-12-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: byebug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: 'This is a wrapper for the vn_tagger library, a POS tagger for Vietnamese
70
+   texts.'
71
+ email:
72
+ - hieuk09@gmail.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - VN_TAGGER LICENSE.txt
83
+ - lib/commons-cli-1.2.jar
84
+ - lib/edu.stanford.nlp.tagger-2.0.jar
85
+ - lib/vn.hus.nlp.fsm-1.0.0.jar
86
+ - lib/vn.hus.nlp.tokenizer-4.1.1.jar
87
+ - lib/vn.hus.nlp.utils-1.0.0.jar
88
+ - lib/vn_tagger.rb
89
+ - lib/vn_tagger/tagger.rb
90
+ - lib/vn_tagger/version.rb
91
+ - resources/automata/dfaLexicon.xml
92
+ - resources/automata/externalLexicon.xml
93
+ - resources/bigram/bigram.xml
94
+ - resources/bigram/unigram.xml
95
+ - resources/lexers/lexers.xml
96
+ - resources/models/vtb.tagger
97
+ - resources/models/vtb.tagger.props
98
+ - resources/normalization/rules.txt
99
+ - resources/prefix/namedEntityPrefix.xml
100
+ - spec/spec_helper.rb
101
+ - spec/vn_tagger/tagger_spec.rb
102
+ - spec/vn_tagger_spec.rb
103
+ - vn.hus.nlp.tagger-4.2.0.jar
104
+ - vnTagger.sh
105
+ - vn_tagger.gemspec
106
+ homepage: ''
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ - lib/vn_tagger
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 2.4.3
128
+ signing_key:
129
+ specification_version: 4
130
+ summary: This is a wrapper for the vn_tagger library, a POS tagger for Vietnamese texts.
131
+ test_files:
132
+ - spec/spec_helper.rb
133
+ - spec/vn_tagger/tagger_spec.rb
134
+ - spec/vn_tagger_spec.rb