RubyGems - annex_29 - Versions diffs - 0.1.1 → 0.2.0 - Mend

annex_29 0.1.1 → 0.2.0

Files changed (24) hide show

checksums.yaml +5 -5
data/.github/workflows/ci.yml +26 -0
data/.github/workflows/cla.yml +22 -0
data/.gitignore +2 -0
data/.rspec +2 -0
data/.ruby-version +1 -0
data/CHANGELOG.md +38 -0
data/Gemfile +3 -0
data/Gemfile.lock +35 -0
data/LICENSE.md +64 -0
data/README.md +3 -0
data/Rakefile +28 -0
data/annex_29.gemspec +28 -0
data/bin/rake +17 -0
data/bin/rspec +17 -0
data/data/Blocks.txt +309 -0
data/data/LineBreak.txt +3269 -0
data/data/Scripts.txt +2632 -0
data/data/WordBreakProperty.txt +1298 -0
data/data/WordBreakTest.txt +2084 -0
data/lib/annex_29/version.rb +5 -0
data/lib/annex_29/word_segmentation.rl.erb +190 -0
data/lib/annex_29.rb +4 -0
metadata +32 -12

data/lib/annex_29/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Annex29
+  # Version string of the annex_29 gem.
+  VERSION = "0.2.0"
+end

data/lib/annex_29/word_segmentation.rl.erb ADDED Viewed

@@ -0,0 +1,190 @@
+<%
+  require("pathname")
+  property_regex = %r{
+    ^
+    (?<lower_bound>\h+)(?:\.\.(?<upper_bound>\h+))?
+    \s*
+    ;
+    \s*
+    (?<category>[^\#]+)
+  }x
+  general_category_regex = %r{
+    ^
+    (?<lower_bound>\h+)(?:\.\.(?<upper_bound>\h+))?
+    \s*
+    ;
+    \s*
+    (?:[^\#]+)
+    \#.
+    (?<category>[A-Z][a-z])
+  }x
+  parse_unicode_data = ->(file_name, regex = property_regex) do
+    data = Hash.new { |hash, key| hash[key] = [] }
+    File.open(Pathname.new("data").join(file_name)).each_line do |line|
+      next unless match = line.match(regex)
+      lower_bound = match["lower_bound".freeze]
+      category = match["category".freeze].strip.downcase.gsub(" ", "_").intern
+      if upper_bound = match["upper_bound".freeze]
+        data[category] << "0x#{lower_bound}..0x#{upper_bound}"
+      else
+        data[category] << "0x#{lower_bound}"
+      end
+    end
+    data
+  end
+  block_map = parse_unicode_data.("Blocks.txt")
+  line_break_map = parse_unicode_data.("LineBreak.txt")
+  script_map = parse_unicode_data.("Scripts.txt")
+  general_category_map = parse_unicode_data.("Scripts.txt", general_category_regex)
+  word_break_property_map = parse_unicode_data.("WordBreakProperty.txt")
+%>
+# Namespace of the annex_29 gem.
+module Annex29
+  # Word segmenter built around the Ragel scanner "segmenter" defined below.
+  module WordSegmentation
+    %%{
+      machine segmenter;
+      alphtype int;
+      # Appends the matched span of data, packed as a UTF-8 string, to words.
+      action word {
+        words << data[ts...te].pack("U*")
+      }
+      # Code point classes interpolated from the tables built in the template preamble.
+      nd = (<%= general_category_map[:nd].join("|") %>);
+      block_half_and_full_forms = (<%= block_map[:halfwidth_and_fullwidth_forms].join("|") %>);
+      lb_complex_context = (<%= line_break_map[:sa].join("|") %>);
+      script_han = (<%= script_map[:han].join("|") %>);
+      script_hangul = (<%= script_map[:hangul].join("|") %>);
+      script_hiragana = (<%= script_map[:hiragana].join("|") %>);
+      # One wb_<category> definition is emitted per entry of word_break_property_map.
+      <% word_break_property_map.each do |category, code_point_range| %>
+        <%= "wb_#{category}" %> = (<%= code_point_range.join("|") %>);
+      <% end %>
+      # Zero or more format/extend/zwj code points; each *_ex class below is a
+      # base class followed by this run.
+      sticky = (wb_format | wb_extend | wb_zwj)*;
+      complex_context_ex = lb_complex_context sticky;
+      double_quote_ex = wb_double_quote sticky;
+      e_base_ex = wb_e_base sticky;
+      e_base_gaz_ex = wb_e_base_gaz sticky;
+      e_modifier_ex = wb_e_modifier sticky;
+      extend_num_let_ex = wb_extendnumlet sticky;
+      han_ex = script_han sticky;
+      hangul_ex = (script_hangul & (wb_aletter | wb_hebrew_letter)) sticky;
+      hebrew_letter_ex = wb_hebrew_letter sticky;
+      hebrew_or_aletter_ex = (wb_aletter | wb_hebrew_letter) sticky;
+      hiragana_ex = script_hiragana sticky;
+      katakana_ex = wb_katakana sticky;
+      mid_letter_ex = (wb_midletter | wb_midnumlet | wb_single_quote) sticky;
+      mid_numeric_ex = (wb_midnum | wb_midnumlet | wb_single_quote) sticky;
+      numeric_ex = (wb_numeric | (block_half_and_full_forms & nd)) sticky;
+      regional_indicator_ex = wb_regional_indicator sticky;
+      single_quote_ex = wb_single_quote sticky;
+      # Segment-level definitions assembled from the *_ex classes.
+      numeric =
+        extend_num_let_ex*
+        numeric_ex ((extend_num_let_ex* | mid_numeric_ex) numeric_ex)*
+        extend_num_let_ex*;
+      hangul = hangul_ex+;
+      katakana = katakana_ex+;
+      south_east_asian = complex_context_ex+;
+      ideographic = han_ex;
+      hiragana = hiragana_ex;
+      extend_num_let = extend_num_let_ex+;
+      inner_word =
+        (katakana_ex (extend_num_let_ex* katakana_ex)*) |
+        (
+          (hebrew_letter_ex (single_quote_ex | (double_quote_ex hebrew_letter_ex))) |
+          (numeric_ex ((extend_num_let_ex* | mid_numeric_ex) numeric_ex)*) |
+          (hebrew_or_aletter_ex ((extend_num_let_ex* | mid_letter_ex) hebrew_or_aletter_ex)*)
+        )+;
+      word =
+        extend_num_let_ex*
+        inner_word
+        (extend_num_let_ex+ inner_word)*
+        extend_num_let_ex*;
+      newline =
+        wb_cr wb_lf |
+        wb_lf |
+        wb_cr |
+        wb_newline;
+      flags = regional_indicator_ex regional_indicator_ex;
+      emoji =
+        e_base_ex e_modifier_ex? |
+        wb_zwj? e_base_gaz_ex e_modifier_ex? |
+        wb_zwj wb_glue_after_zwj sticky;
+      # Every kind of segment the scanner may emit; the final "any" alternative
+      # accepts a single symbol of any value.
+      word_like =
+        numeric |
+        hangul |
+        katakana |
+        word |
+        south_east_asian |
+        ideographic |
+        hiragana |
+        extend_num_let |
+        flags |
+        emoji |
+        newline |
+        ^(newline) sticky |
+        sticky |
+        any;
+      # Scanner: each word_like match runs the word action.
+      main := |*
+        word_like => word;
+      *|;
+    }%%
+    %% write data;
+    class << self
+      # Splits input into segments using the generated scanner.
+      #
+      # input - object responding to each_char; each character is converted
+      #         to an integer with ord.
+      #
+      # Returns the Array of Strings collected by the word action.
+      def call(input)
+        # The machine declares alphtype int, so it is fed integers rather than characters.
+        data = input.each_char.map(&:ord)
+        # NOTE(review): eof is presumably read by the code "write exec" generates - confirm.
+        eof = data.length
+        words = []
+        %% write init;
+        %% write exec;
+        words
+      end
+    end
+  end
+end

data/lib/annex_29.rb CHANGED Viewed

@@ -1,3 +1,7 @@
+# frozen_string_literal: true
+require 'pathname'
 module Annex29
   require("annex_29/word_segmentation")

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: annex_29
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
 platform: ruby
 authors:
-- Simon Génier
+- Shopify
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-18 00:00:00.000000000 Z
+date: 2023-12-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '11.3'
+        version: '13.1'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '11.3'
+        version: '13.1'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -39,17 +39,38 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '3.5'
 description:
-email: simon.genier@shopify.com
+email: developers@shopify.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".github/workflows/ci.yml"
+- ".github/workflows/cla.yml"
+- ".gitignore"
+- ".rspec"
+- ".ruby-version"
+- CHANGELOG.md
+- Gemfile
+- Gemfile.lock
+- LICENSE.md
+- README.md
+- Rakefile
+- annex_29.gemspec
+- bin/rake
+- bin/rspec
+- data/Blocks.txt
+- data/LineBreak.txt
+- data/Scripts.txt
+- data/WordBreakProperty.txt
+- data/WordBreakTest.txt
 - lib/annex_29.rb
+- lib/annex_29/version.rb
 - lib/annex_29/word_segmentation.rb
+- lib/annex_29/word_segmentation.rl.erb
 homepage: https://github.com/Shopify/annex-29
-licenses:
-- Apache-2.0
-metadata: {}
+licenses: []
+metadata:
+  allowed_push_host: https://rubygems.org/
 post_install_message:
 rdoc_options: []
 require_paths:
@@ -58,15 +79,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 3.2.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.5.1
+rubygems_version: 3.4.22
 signing_key:
 specification_version: 4
 summary: Unicode annex 29 compliant word segmentation