lulalala_address_tokenizer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 37508767abe6d7128484245eae8ab83ff0baf168
4
+ data.tar.gz: 78e8693219023cf0e6223335b250d69808ccfc37
5
+ SHA512:
6
+ metadata.gz: abcd4427aa3d668dd846f7cda3a23810348d1abc70aeac1875d383150ebec3d9d0d5cc56a57bdd8656bde4234efe74584499fd7ca45a8c0cc896561faac5afe7
7
+ data.tar.gz: c008a2ec2f92cfa4ec7456ec965f456ab4b87e5931ba0f1c78c4b04055dee0e0163141ad02a0ffa3354c618f5d7c56d434a6eba21b75a6be4389eb72d99a1207
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in lulalala_address_tokenizer.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # LulalalaAddressTokenizer
2
+
3
+ Postal addresses tokenizer using Wapiti model.
4
+
5
+ Intended for addresses in CJK (Chinese, Japanese, Korean) characters.
6
+ After the Wapiti model labels each token (character), this gem combines adjacent tokens that share the same label.
7
+ This is important for CJK languages because their phrases (combinations of words) are not separated by spaces.
8
+
9
+ 台灣地址分詞用
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'lulalala_address_tokenizer'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ $ bundle
22
+
23
+ Or install it yourself as:
24
+
25
+ $ gem install lulalala_address_tokenizer
26
+
27
+ ## Usage
28
+
29
+ ```
30
+ tokenizer = LulalalaAddressTokenizer.new('address.mod')
31
+ tokenizer.parse("AA縣BB鎮CC路D號")
32
+ # {"city"=>"AA縣", "district"=>"BB鎮", "street"=>"CC路", "housenumber"=>"D號"}
33
+ ```
34
+
35
+ ## Contributing
36
+
37
+ Bug reports and pull requests are welcome on GitHub at https://github.com/lulalala/address_tokenizer.
38
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
require "bundler/gem_tasks"

# This Rakefile defines no :spec task, so depending on :spec made a bare
# `rake` abort. Point the default task at :build (provided by
# bundler/gem_tasks) so that `rake` with no arguments packages the gem.
task :default => :build
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "lulalala_address_tokenizer"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,45 @@
1
+ require "lulalala_address_tokenizer/version"
2
+ require "wapiti"
3
+
4
# Splits a postal address into labelled phrases using a Wapiti model.
#
# Every character of the address is labelled by the model; runs of adjacent
# characters that received the same label are merged into one phrase.
class LulalalaAddressTokenizer
  # Loads the model used for labelling.
  #
  # @param model_path [String] path of the model file, handed to Wapiti.load
  def initialize(model_path)
    @model = Wapiti.load(model_path)
  end

  # Labels each character of +address+ and groups the characters by label.
  #
  # ASCII whitespace characters are skipped before labelling: observations
  # are whitespace-separated ("<char> n"), so a whitespace character would
  # produce an observation without a token and the feature column "n" would
  # leak into the result. An address without any labelable character returns
  # an empty Hash without calling the model.
  #
  # @param address [String]
  # @return [Hash] label => phrase
  def parse(address)
    observations = address.each_char
                          .reject { |ch| ch =~ /\A\s\z/ }
                          .map { |ch| "#{ch} n" }
    return {} if observations.empty?

    labelled = @model.label([observations])
    join(labelled.first)
  end

  private

  # Merges consecutive segments that carry the same label.
  #
  # Each segment is expected to be an Array whose first element is the
  # observation string ("<token> <features...>") and whose last element is
  # the label (presumably a String, as returned by the model -- confirm
  # against the Wapiti version in use). When a label occurs in several
  # non-adjacent runs, the last run wins while the label keeps the position
  # of its first occurrence.
  #
  # @param segments [Array<Array>]
  # @return [Hash] label => concatenated tokens
  def join(segments)
    runs = segments.chunk { |segment| segment.last }
    runs.each_with_object({}) do |(label, run), phrases|
      # Only the first whitespace-separated column is the token itself.
      phrases[label] = run.map { |segment| segment.first.split(' ').first.to_s }.join
    end
  end
end
@@ -0,0 +1,3 @@
1
class LulalalaAddressTokenizer
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,27 @@
1
# coding: utf-8
# Gem specification for lulalala_address_tokenizer.

# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'lulalala_address_tokenizer/version'

Gem::Specification.new do |spec|
  spec.name    = "lulalala_address_tokenizer"
  spec.version = LulalalaAddressTokenizer::VERSION
  spec.authors = ["lulalala"]
  spec.email   = ["mark@goodlife.tw"]

  spec.summary     = "Postal addresses tokenizer using Wapiti model"
  spec.description = "Postal addresses tokenizer using Wapiti model"
  spec.homepage    = "https://github.com/lulalala/address_tokenizer"

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.13"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_dependency "wapiti"
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lulalala_address_tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - lulalala
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-11-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.13'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.13'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: wapiti
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Postal addresses tokenizer using Wapiti model
56
+ email:
57
+ - mark@goodlife.tw
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - Gemfile
64
+ - README.md
65
+ - Rakefile
66
+ - bin/console
67
+ - bin/setup
68
+ - lib/lulalala_address_tokenizer.rb
69
+ - lib/lulalala_address_tokenizer/version.rb
70
+ - lulalala_address_tokenizer.gemspec
71
+ homepage: https://github.com/lulalala/address_tokenizer
72
+ licenses: []
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubyforge_project:
90
+ rubygems_version: 2.4.5
91
+ signing_key:
92
+ specification_version: 4
93
+ summary: Postal addresses tokenizer using Wapiti model
94
+ test_files: []
95
+ has_rdoc: