lulalala_address_tokenizer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 37508767abe6d7128484245eae8ab83ff0baf168
4
+ data.tar.gz: 78e8693219023cf0e6223335b250d69808ccfc37
5
+ SHA512:
6
+ metadata.gz: abcd4427aa3d668dd846f7cda3a23810348d1abc70aeac1875d383150ebec3d9d0d5cc56a57bdd8656bde4234efe74584499fd7ca45a8c0cc896561faac5afe7
7
+ data.tar.gz: c008a2ec2f92cfa4ec7456ec965f456ab4b87e5931ba0f1c78c4b04055dee0e0163141ad02a0ffa3354c618f5d7c56d434a6eba21b75a6be4389eb72d99a1207
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in lulalala_address_tokenizer.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # LulalalaAddressTokenizer
2
+
3
+ Postal addresses tokenizer using Wapiti model.
4
+
5
+ Intended for addresses in CJK (Chinese, Japanese, Korean) characters.
6
+ After the Wapiti model labels each token (character), this gem combines adjacent characters that share the same label.
7
+ This is important for CJK languages because their phrases (combinations of words) are not separated by spaces.
8
+
9
+ 台灣地址分詞用
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'lulalala_address_tokenizer'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ $ bundle
22
+
23
+ Or install it yourself as:
24
+
25
+ $ gem install lulalala_address_tokenizer
26
+
27
+ ## Usage
28
+
29
+ ```ruby
30
+ tokenizer = LulalalaAddressTokenizer.new('address.mod')
31
+ tokenizer.parse("AA縣BB鎮CC路D號")
32
+ # {"city"=>"AA縣", "district"=>"BB鎮", "street"=>"CC路", "housenumber"=>"D號"}
33
+ ```
34
+
35
+ ## Contributing
36
+
37
+ Bug reports and pull requests are welcome on GitHub at https://github.com/lulalala/address_tokenizer.
38
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
require "bundler/gem_tasks"

# No `spec` task is defined in this Rakefile and the gemspec declares no test
# framework, so pointing the default task at :spec makes a bare `rake` abort.
# Default to the :build task that bundler/gem_tasks provides instead.
task :default => :build
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "lulalala_address_tokenizer"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,45 @@
1
+ require "lulalala_address_tokenizer/version"
2
+ require "wapiti"
3
+
4
# Tokenizes a postal address by labelling every character with a Wapiti model
# and merging adjacent characters that received the same label.
class LulalalaAddressTokenizer
  # Feature column appended to each character before it is handed to the model.
  # NOTE(review): presumably mirrors the column layout the model was trained
  # with -- confirm against the training data.
  FEATURE_SUFFIX = ' n'.freeze

  # @param model_path [String] model location, passed straight to Wapiti.load
  def initialize(model_path)
    @model = Wapiti.load(model_path)
  end

  # Labels each character of +address+ and groups the characters by label.
  #
  # @param address [String]
  # @return [Hash] label => merged characters, keyed in order of first appearance
  def parse(address)
    # Nothing to label; avoids handing the model an empty sequence.
    return {} if address.empty?

    tokens = address.chars.map { |ch| "#{ch}#{FEATURE_SUFFIX}" }
    labelled = @model.label([tokens])
    join(labelled.first)
  end

  private

  # Merges runs of adjacent segments that share a label.
  #
  # @param segments [Array<Array>] entries whose first element is the token
  #   string and whose last element is the label
  # @return [Hash] label => concatenated words. When a label occurs in several
  #   non-adjacent runs, only the last run is kept.
  def join(segments)
    runs = segments.slice_when { |left, right| left.last != right.last }
    runs.each_with_object({}) do |run, result|
      result[run.first.last] = run.map { |segment| word_of(segment.first) }.join
    end
  end

  # Recovers the original word from a token string.
  #
  # Tokens built by #parse end in FEATURE_SUFFIX; stripping that suffix (rather
  # than splitting on whitespace) keeps a whitespace character intact instead
  # of returning the feature column in its place.
  def word_of(token)
    return token.chomp(FEATURE_SUFFIX) if token.end_with?(FEATURE_SUFFIX)

    # Any other token shape: fall back to the first whitespace-separated column.
    token.split(' ').first.to_s
  end
end
@@ -0,0 +1,3 @@
1
class LulalalaAddressTokenizer
  # Gem version, read by the gemspec. Frozen so the shared constant cannot be
  # mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'lulalala_address_tokenizer/version'
5
+
6
# Gem packaging metadata for lulalala_address_tokenizer.
Gem::Specification.new do |spec|
  spec.name          = "lulalala_address_tokenizer"
  spec.version       = LulalalaAddressTokenizer::VERSION
  spec.authors       = ["lulalala"]
  spec.email         = ["mark@goodlife.tw"]

  spec.summary       = %q{Postal addresses tokenizer using Wapiti model}
  # Kept distinct from the summary; RubyGems warns when the two are identical.
  spec.description   = %q{Tokenizes postal addresses written in CJK characters: a Wapiti model labels each character, and adjacent characters that share a label are merged into one token.}
  spec.homepage      = "https://github.com/lulalala/address_tokenizer"
  # NOTE(review): no license is declared -- add spec.license once the intended
  # license is confirmed.

  # Package every git-tracked file except test directories.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.13"
  spec.add_development_dependency "rake", "~> 10.0"

  # NOTE(review): open-ended runtime dependency; consider pinning to the wapiti
  # release line this gem was developed against.
  spec.add_dependency "wapiti"
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lulalala_address_tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - lulalala
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-11-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.13'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.13'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: wapiti
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Postal addresses tokenizer using Wapiti model
56
+ email:
57
+ - mark@goodlife.tw
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - Gemfile
64
+ - README.md
65
+ - Rakefile
66
+ - bin/console
67
+ - bin/setup
68
+ - lib/lulalala_address_tokenizer.rb
69
+ - lib/lulalala_address_tokenizer/version.rb
70
+ - lulalala_address_tokenizer.gemspec
71
+ homepage: https://github.com/lulalala/address_tokenizer
72
+ licenses: []
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubyforge_project:
90
+ rubygems_version: 2.4.5
91
+ signing_key:
92
+ specification_version: 4
93
+ summary: Postal addresses tokenizer using Wapiti model
94
+ test_files: []
95
+ has_rdoc: