vn_tagger 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,68 @@
1
+ <?xml version="1.0" encoding="utf-8" standalone="yes"?>
2
+ <!-- (C) phuonglh@gmail.com -->
3
+ <corpus id="resources/lexers/lexers.xml">
4
+ <body>
5
+ <w msd="numbersign">#</w>
6
+ <w msd="ampersand">&amp;</w>
7
+ <w msd="date_mm-dd-yy">(0*[1-9]|1[012])-(0*[1-9]|[12][0-9]|3[01])-\d\d</w>
8
+ <w msd="date_mm/dd/yy">(0*[1-9]|1[012])/(0*[1-9]|[12][0-9]|3[01])/\d\d</w>
9
+ <w msd="date_mm.dd.yy">(0*[1-9]|1[012])\.(0*[1-9]|[12][0-9]|3[01])\.\d\d</w>
10
+ <w msd="date_dd-mm-yy">([12][0-9]|3[01]|0*[1-9])-(1[012]|0*[1-9])-\d\d</w> <!-- day-first dates: the month group is 1[012] or 0*[1-9]; an empty month is not accepted -->
11
+ <w msd="date_dd/mm/yy">([12][0-9]|3[01]|0*[1-9])/(1[012]|0*[1-9])/\d\d</w>
12
+ <w msd="date_dd.mm.yy">([12][0-9]|3[01]|0*[1-9])[\.](1[012]|0*[1-9])[\.]\d\d</w>
13
+ <w msd="date_dd-mm-yyyy">([12][0-9]|3[01]|0*[1-9])-(1[012]|0*[1-9])-(19|20)\d\d</w>
14
+ <w msd="date_dd/mm/yyyy">([12][0-9]|3[01]|0*[1-9])/(1[012]|0*[1-9])/(19|20)\d\d</w>
15
+ <w msd="date_dd.mm.yyyy">([12][0-9]|3[01]|0*[1-9])\.(1[012]|0*[1-9])\.(19|20)\d\d</w>
16
+ <w msd="date_dd-mm">(0*[1-9]|[12][0-9]|3[01])[-/\.](1[012]|0*[1-9])</w>
17
+ <w msd="date_mm-yy">(0*[1-9]|1[012])[-/\.]\d\d</w>
18
+ <w msd="date_mm-yyyy">(0*[1-9]|1[012])[-/\.](19|20)\d\d</w>
19
+ <w msd="date_yyyy">(19|20)\d\d</w>
20
+ <w msd="date_mm-dd-yyyy">(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])-(19|20)\d\d</w>
21
+ <w msd="date_mm/dd/yyyy">(0*[1-9]|1[012])/([12][0-9]|3[01]|0*[1-9])/(19|20)\d\d</w>
22
+ <w msd="date_mm.dd.yyyy">(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])\.(19|20)\d\d</w>
23
+ <w msd="date_yyyy-mm-dd">(19|20)\d\d-(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])</w>
24
+ <w msd="date_yyyy/mm/dd">(19|20)\d\d/(0*[1-9]|1[012])/([12][0-9]|3[01]|0*[1-9])</w>
25
+ <w msd="date_yyyy.mm.dd">(19|20)\d\d\.(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])</w>
26
+ <w msd="hhmmss">([0-1]\d|[2][0-3]):[0-5]\d:[0-5]\d</w>
27
+ <w msd="percent">([0-9]*[\.,])?[0-9]+%</w>
28
+ <w msd="name1">[A-ZÁÂĐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*</w>
29
+ <w msd="name2">([A-ZÁÂĐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*)(\s+[A-ZÁÂĐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*</w>
30
+ <w msd="phrase">([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ])?([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\s])*([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz])+$*</w>
31
+ <w msd="allcaps">([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)(\s*[AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)*[^aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz/\)\(\?!\.;:,\-"']</w>
32
+ <w msd="fraction">(\d+)/(\d+)</w>
33
+ <w msd="email">(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})</w>
34
+ <w msd="return">(^$)</w>
35
+ <w msd="fslash">/</w>
36
+ <w msd="langle">&lt;</w>
37
+ <w msd="xmltags">&lt;/*\w*&gt;</w>
38
+ <w msd="equal">=</w>
39
+ <w msd="rangle">&gt;</w>
40
+ <w msd="aroba">@</w>
41
+ <w msd="number1">[+]?([0-9]*)?[0-9]+([\.,]\d+)*</w>
42
+ <!--
43
+ <w msd="number2">[+]?([0-9]*)?[0-9]+([\.,]\d+)*(\s|tỉ|tỷ|triệu|ngàn|nghìn|trăm|chục)*</w>
44
+ -->
45
+ <w msd="degree">[-+]?([0-9]*[\.,])?[0-9]+°</w>
46
+ <w msd="ponctuation">[\\?!\\.:;,\-"']</w>
47
+ <w msd="dollar">\$</w>
48
+ <w msd="lparen">\(</w>
49
+ <w msd="rparen">\)</w>
50
+ <w msd="asterisk">\*</w>
51
+ <w msd="plus">\+</w>
52
+ <w msd="minus">\-</w>
53
+ <w msd="ellipsis">\.\.\.</w>
54
+ <w msd="residual">\W</w>
55
+ <w msd="lbracket">\[</w>
56
+ <w msd="bslash">\\</w>
57
+ <w msd="rbracket">\]</w>
58
+ <w msd="entity0">\d+([\.,]\d+)*[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+\d+$</w>
59
+ <w msd="entity1">[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+(\d)*$</w>
60
+ <w msd="entity2">[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\d]+([\.\-/][\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*[\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+</w>
61
+ <w msd="space">\s+</w>
62
+ <w msd="word">\w</w>
63
+ <w msd="lcbrace">\{</w>
64
+ <w msd="rcbrace">\}</w>
65
+ <w msd="underscore">_</w>
66
+ <w msd="pound">£</w>
67
+ </body>
68
+ </corpus>
Binary file
@@ -0,0 +1,36 @@
1
+ ## tagger training invoked at Thu Aug 05 14:33:50 CEST 2010 with arguments:
2
+ model = resources/models/vtb.tagger
3
+ arch = left5words,vietnameseunknowns
4
+ trainFile = data/vtb-20091030.tagged.txt
5
+ closedClassTags =
6
+ closedClassTagThreshold = 40
7
+ curWordMinFeatureThresh = 2
8
+ debug = true
9
+ debugPrefix =
10
+ tagSeparator = /
11
+ encoding = UTF-8
12
+ initFromTrees = false
13
+ iterations = 100
14
+ lang = vietnamese
15
+ learnClosedClassTags = false
16
+ minFeatureThresh = 2
17
+ openClassTags =
18
+ rareWordMinFeatureThresh = 10
19
+ rareWordThresh = 6
20
+ search = qn
21
+ sgml = false
22
+ sigmaSquared = 0.5
23
+ regL1 = 1.0
24
+ tagInside =
25
+ tokenize = false
26
+ tokenizerFactory =
27
+ tokenizerOptions =
28
+ treeRange =
29
+ treeNormalizer =
30
+ treeTransformer =
31
+ verbose = false
32
+ veryCommonWordThresh = 50
33
+ xmlInput =
34
+ outputFile =
35
+ outputFormat = slashTags
36
+ outputFormatOptions =
@@ -0,0 +1,15 @@
1
+ òa oà
2
+ óa oá
3
+ ỏa oả
4
+ õa oã
5
+ ọa oạ
6
+ òe oè
7
+ óe oé
8
+ ỏe oẻ
9
+ õe oẽ
10
+ ọe oẹ
11
+ ùy uỳ
12
+ úy uý
13
+ ủy uỷ
14
+ ũy uỹ
15
+ ụy uỵ
@@ -0,0 +1,34 @@
1
+ <?xml version="1.0" encoding="utf-8" standalone="yes"?>
2
+ <corpus id="resources/prefix/namedEntityPrefix.xml">
3
+ <body>
4
+ <!-- classifier prefix -->
5
+ <w>ông</w>
6
+ <w>bà</w>
7
+ <w>bác</w>
8
+ <w>chú</w>
9
+ <w>cô</w>
10
+ <w>thím</w>
11
+ <w>dì</w>
12
+ <w>cậu</w>
13
+ <w>cụ</w>
14
+ <w>mợ</w>
15
+ <w>ngài</w>
16
+ <w>anh</w>
17
+ <w>chị</w>
18
+ <w>thằng</w>
19
+ <w>cái</w>
20
+ <w>vua</w>
21
+ <!-- organization prefix -->
22
+ <w>tỉnh</w>
23
+ <w>huyện</w>
24
+ <w>xã</w>
25
+ <w>phường</w>
26
+ <w>bộ</w>
27
+ <w>sở</w>
28
+ <w>cục</w>
29
+ <w>ban</w>
30
+ <w>ngành</w>
31
+ <!-- Other prefix -->
32
+ <w>theo</w>
33
+ </body>
34
+ </corpus>
@@ -0,0 +1,9 @@
1
+ require 'byebug'
2
+
3
+ spec_folder = File.dirname(__FILE__)
4
+ root_folder = File.dirname(spec_folder)
5
+ app_folder = File.join(root_folder, 'lib', '*.rb')
6
+ Dir[app_folder].each { |file| require file }
7
+
8
+ RSpec.configure do |config|
9
+ end
@@ -0,0 +1,14 @@
1
+ require 'spec_helper'
2
+ require 'byebug'
3
+
4
+ describe VnTagger::Tagger do
5
+ describe '#tag' do
6
+ let(:text) { 'HLV của Chelsea không hối tiếc vì hành động bỏ về sớm trong trận gặp Aston Villa.' }
7
+ let(:tagger) { described_class.new(text) }
8
+ let(:result) { tagger.tag }
9
+
10
+ it 'returns xml tagged text' do
11
+ expect(result).to be_a(Nokogiri::XML::Document)
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+ describe VnTagger do
4
+ describe '#tokenize' do
5
+ end
6
+ end
Binary file
data/vnTagger.sh ADDED
@@ -0,0 +1,28 @@
1
+ #!/bin/sh
2
+
3
+ # The main program to run: the tagger jar located next to this script ($0 works in any POSIX sh; BASH_SOURCE is bash-only)
4
+ PROGRAM="$(dirname "$0")/vn.hus.nlp.tagger-4.2.0.jar"
5
+
6
+ # Get the java command
7
+ # (a preset $JAVACMD wins, then $JAVA_HOME, then the java found on PATH)
8
+ if [ -z "$JAVACMD" ] ; then
9
+ if [ -n "$JAVA_HOME" ] ; then
10
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
11
+ JAVACMD="$JAVA_HOME/jre/sh/java"
12
+ else
13
+ JAVACMD="$JAVA_HOME/bin/java"
14
+ fi
15
+ else
16
+ JAVACMD=`which java 2> /dev/null`
17
+ if [ -z "$JAVACMD" ] ; then
18
+ JAVACMD=java
19
+ fi
20
+ fi
21
+ fi
22
+
23
+ # Run the programme
24
+ # (everything is quoted so that paths and arguments containing spaces survive word splitting)
25
+ "$JAVACMD" -mx500m -jar "$PROGRAM" "$@"
26
+
27
+
28
+
data/vn_tagger.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'vn_tagger/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "vn_tagger"
8
+ spec.version = VnTagger::VERSION
9
+ spec.authors = ["Hieu Nguyen"]
10
+ spec.email = ["hieuk09@gmail.com"]
11
+ spec.summary = %q{A wrapper for the vn_tagger library, a POS tagger for Vietnamese texts.}
12
+ spec.description = %q{A Ruby wrapper for the vn_tagger Java library (vn.hus.nlp.tagger), a part-of-speech tagger for Vietnamese texts.}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib", 'lib/vn_tagger']
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.6"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "rspec"
24
+ spec.add_development_dependency "byebug"
25
+ end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vn_tagger
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Hieu Nguyen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-12-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: byebug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: 'This is a wrapper for vn_tagger library, a A POS tagger for Vietnamese
70
+ texts.'' '
71
+ email:
72
+ - hieuk09@gmail.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - VN_TAGGER LICENSE.txt
83
+ - lib/commons-cli-1.2.jar
84
+ - lib/edu.stanford.nlp.tagger-2.0.jar
85
+ - lib/vn.hus.nlp.fsm-1.0.0.jar
86
+ - lib/vn.hus.nlp.tokenizer-4.1.1.jar
87
+ - lib/vn.hus.nlp.utils-1.0.0.jar
88
+ - lib/vn_tagger.rb
89
+ - lib/vn_tagger/tagger.rb
90
+ - lib/vn_tagger/version.rb
91
+ - resources/automata/dfaLexicon.xml
92
+ - resources/automata/externalLexicon.xml
93
+ - resources/bigram/bigram.xml
94
+ - resources/bigram/unigram.xml
95
+ - resources/lexers/lexers.xml
96
+ - resources/models/vtb.tagger
97
+ - resources/models/vtb.tagger.props
98
+ - resources/normalization/rules.txt
99
+ - resources/prefix/namedEntityPrefix.xml
100
+ - spec/spec_helper.rb
101
+ - spec/vn_tagger/tagger_spec.rb
102
+ - spec/vn_tagger_spec.rb
103
+ - vn.hus.nlp.tagger-4.2.0.jar
104
+ - vnTagger.sh
105
+ - vn_tagger.gemspec
106
+ homepage: ''
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ - lib/vn_tagger
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 2.4.3
128
+ signing_key:
129
+ specification_version: 4
130
+ summary: This is a wrapper for vn_tagger library, a A POS tagger for Vietnamese texts.'
131
+ test_files:
132
+ - spec/spec_helper.rb
133
+ - spec/vn_tagger/tagger_spec.rb
134
+ - spec/vn_tagger_spec.rb