wordcuta 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wordcuta.rb ADDED
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+ require "ffi"
3
+
4
# Raw FFI bindings to the native "wordcutw" shared library.
# Everything here deals in pointers; the Ruby-friendly wrappers live in WordcutA.
module WordcutFFI
  # Native record with two size_t fields, :s and :e
  # (presumably the start/end of one segmented range -- confirm against wordcutw).
  class TextRange < FFI::Struct
    layout :s, :size_t,
           :e, :size_t
  end

  extend FFI::Library

  ffi_lib 'wordcutw'

  # Native symbol => [argument types, return type].
  # NOTE(review): wordcut_put_delimiters is declared as returning :string, so the
  # caller only ever receives a Ruby String; whether the native buffer behind it
  # needs an explicit release cannot be determined from this file.
  # NOTE(review): no release function is bound for the pointer returned by
  # wordcut_into_text_ranges -- confirm who owns that memory.
  {
    wordcut_new_with_dict: [%i[string], :pointer],
    wordcut_into_text_ranges: [%i[pointer string pointer], :pointer],
    wordcut_into_strings: [%i[pointer string pointer], :pointer],
    wordcut_put_delimiters: [%i[pointer string string], :string],
    delete_wordcut: [%i[pointer], :void],
    delete_strings: [%i[pointer size_t], :void]
  }.each do |symbol, (arg_types, return_type)|
    attach_function symbol, arg_types, return_type
  end
end
21
+
22
# High-level Ruby interface to the native wordcutw word segmenter.
module WordcutA
  # Plain-Ruby counterpart of WordcutFFI::TextRange. The :s and :e members are
  # copied unchanged from the native struct's fields of the same names
  # (presumably start/end offsets -- confirm units against wordcutw).
  TextRange = Struct.new(:s, :e)

  # Dictionary file path, resolved two levels up from this file's own path.
  DEFAULT_THAI_DICT_PATH = File.expand_path('../../data/thai-dix.txt', __FILE__)

  # Owns one native wordcut handle and exposes the segmentation calls on it.
  class Wordcut
    # @param dict_path [String] dictionary file handed to the native constructor;
    #   defaults to DEFAULT_THAI_DICT_PATH
    # @raise [ArgumentError] if the native constructor returns a NULL handle
    def initialize(dict_path = DEFAULT_THAI_DICT_PATH)
      handle = WordcutFFI.wordcut_new_with_dict(dict_path)
      # A NULL handle would be dereferenced by every later native call, so fail
      # here instead. Presumably NULL signals that the dictionary could not be
      # loaded -- confirm against wordcutw.
      raise ArgumentError, "cannot load dictionary: #{dict_path}" if handle.null?

      # AutoPointer invokes delete_wordcut on the handle when it is collected.
      @wordcut_p = FFI::AutoPointer.new(handle, WordcutFFI.method(:delete_wordcut))
    end

    # Segments +text+ and returns the native ranges as Ruby structs.
    #
    # @param text [String] text to segment
    # @return [Array<WordcutA::TextRange>] one entry per native range, in order
    def into_ranges(text)
      ranges = []
      # MemoryPointer.new's return value is the pointer itself, so the result
      # has to be carried out of the block through the outer local.
      FFI::MemoryPointer.new(:size_t, 1) do |count_p|
        ranges_p = WordcutFFI.wordcut_into_text_ranges(@wordcut_p, text, count_p)
        stride = WordcutFFI::TextRange.size
        # NOTE(review): ranges_p is never released here; no release function for
        # it is bound in WordcutFFI -- confirm ownership with the native library.
        ranges = Array.new(read_size_t(count_p)) do |index|
          native = WordcutFFI::TextRange.new(ranges_p + index * stride)
          TextRange.new(native[:s], native[:e])
        end
      end
      ranges
    end

    # Segments +text+ and returns the words as UTF-8 tagged strings.
    #
    # @param text [String] text to segment
    # @return [Array<String>] segmented words
    def into_strings(text)
      words = []
      FFI::MemoryPointer.new(:size_t, 1) do |count_p|
        words_p = WordcutFFI.wordcut_into_strings(@wordcut_p, text, count_p)
        count = read_size_t(count_p)
        # Copy into Ruby strings first, then hand the native array back.
        words = words_p.get_array_of_string(0, count)
        WordcutFFI.delete_strings(words_p, count)
      end
      # Strings read through FFI are not tagged as UTF-8; retag them in place.
      words.map { |word| word.force_encoding('UTF-8') }
    end

    # Returns the native library's delimiter-joined rendering of +text+.
    #
    # @param text [String] text to segment
    # @param delim [String] delimiter passed through to the native call
    # @return [String] result of the native call, tagged as UTF-8
    def put_delimiters(text, delim)
      WordcutFFI.wordcut_put_delimiters(@wordcut_p, text, delim).force_encoding('UTF-8')
    end

    private

    # Reads a size_t out-parameter at its full native width. get_uint reads only
    # an unsigned int, which is narrower than size_t on 64-bit platforms and
    # picks up the wrong half of the value on big-endian ones.
    def read_size_t(pointer)
      FFI.type_size(:size_t) == 8 ? pointer.get_uint64(0) : pointer.get_uint32(0)
    end
  end
end
61
+
62
+ # wordcut = WordcutA::Wordcut.new("thai.txt")
63
+ # p wordcut.into_ranges("กากาม")
64
+ # p wordcut.into_strings("กากาขาม")
65
+ # p wordcut.put_delimiters("กากาขามมาก", "---")
data/wordcuta.gemspec ADDED
@@ -0,0 +1,12 @@
1
# Gem specification for wordcuta.
Gem::Specification.new do |s|
  s.name = 'wordcuta'
  s.version = '0.1.0'
  s.authors = ['Vee Satayamas']
  s.email = ['5ssgdxltv@relay.firefox.com']
  s.licenses = ['LGPL-3.0']
  s.description = "A word segmentation tools for ASEAN languages wrapper for Ruby"
  s.homepage = "https://github.com/veer66/wordcuta"
  s.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
  s.summary = "A word segmentation tools for ASEAN languages wrapper for Ruby"
  s.files = %w(README.md LICENSE Gemfile wordcuta.gemspec data/thai-dix.txt lib/wordcuta.rb)
  # lib/wordcuta.rb requires "ffi" at load time, so the gem must be declared
  # here or a clean install fails on require.
  s.add_runtime_dependency 'ffi', '~> 1.0'
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wordcuta
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Vee Satayamas
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-04-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A word segmentation tools for ASEAN languages wrapper for Ruby
14
+ email:
15
+ - 5ssgdxltv@relay.firefox.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - Gemfile
21
+ - LICENSE
22
+ - README.md
23
+ - data/thai-dix.txt
24
+ - lib/wordcuta.rb
25
+ - wordcuta.gemspec
26
+ homepage: https://github.com/veer66/wordcuta
27
+ licenses:
28
+ - LGPL-3.0
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: 2.3.0
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubygems_version: 3.2.3
46
+ signing_key:
47
+ specification_version: 4
48
+ summary: A word segmentation tools for ASEAN languages wrapper for Ruby
49
+ test_files: []