srx 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +1 -1
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +2 -2
- data/README.md +5 -2
- data/lib/srx/format.rb +2 -1
- data/lib/srx/format/html.rb +41 -0
- data/lib/srx/version.rb +1 -1
- data/srx.gemspec +1 -0
- metadata +4 -1
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 248fc2b9cd220023564c26a792536b1593b2bdefe77b71e133805658bd169d13
         | 
| 4 | 
            +
              data.tar.gz: 7aa58a5e7fa2255219aaf83f7df1b09ebde3711a59c84d0cce7b3811af6f869e
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 0af9311397924bf58131fd753e879f63f32c3706998f5cbb9c3c06ab51cb626a2417d31cd52dba2d905b571d5489efc1ad088a334f7b868fdd66e8e82a64f547
         | 
| 7 | 
            +
              data.tar.gz: 969da7b926a664411a887948f17ff011d8e4040ef970e71d2ecde92ff05e39039aeb2a51238326be83418fda7bdf279241145c024cf41bccc8d4d515027cc039
         | 
    
        data/.github/workflows/main.yml
    CHANGED
    
    
    
        data/.rubocop.yml
    CHANGED
    
    
    
        data/CHANGELOG.md
    ADDED
    
    
    
        data/Gemfile.lock
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | @@ -13,7 +13,10 @@ This gem provides facilities for reading SRX files and an engine for performing | |
| 13 13 | 
             
            segmentation.
         | 
| 14 14 |  | 
| 15 15 | 
             
            Only a minimal rule set is supplied by default; for actual usage you are
         | 
| 16 | 
            -
            encouraged to supply your own SRX rules.
         | 
| 16 | 
            +
            encouraged to supply your own SRX rules. One such set of rules is that from
         | 
| 17 | 
            +
            [LanguageTool](https://languagetool.org/); this is conveniently packaged into a
         | 
| 18 | 
            +
            companion gem:
         | 
| 19 | 
            +
            [srx-languagetool-ruby](https://github.com/amake/srx-languagetool-ruby).
         | 
| 17 20 |  | 
| 18 21 | 
             
            ## What's different about this gem?
         | 
| 19 22 |  | 
| @@ -43,7 +46,7 @@ Some disadvantages: | |
| 43 46 | 
             
              test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
         | 
| 44 47 | 
             
              scoring 47% (English) and 48% (others) with the default rules. However you can
         | 
| 45 48 | 
             
              improve on that with better rules such as
         | 
| 46 | 
            -
              [LanguageTool's](https://github.com/ | 
| 49 | 
            +
              [LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
         | 
| 47 50 |  | 
| 48 51 | 
             
            ## Installation
         | 
| 49 52 |  | 
    
        data/lib/srx/format.rb
    CHANGED
    
    | @@ -3,6 +3,7 @@ | |
| 3 3 | 
             
            require_relative 'format/base_format'
         | 
| 4 4 | 
             
            require_relative 'format/text'
         | 
| 5 5 | 
             
            require_relative 'format/xml'
         | 
| 6 | 
            +
            require_relative 'format/html'
         | 
| 6 7 |  | 
| 7 8 | 
             
            module Srx
         | 
| 8 9 | 
             
              # Format-specific data and logic
         | 
| @@ -10,7 +11,7 @@ module Srx | |
| 10 11 | 
             
                FORMATS = {
         | 
| 11 12 | 
             
                  text: Text.new,
         | 
| 12 13 | 
             
                  xml: Xml.new,
         | 
| 13 | 
            -
                  html:  | 
| 14 | 
            +
                  html: Html.new
         | 
| 14 15 | 
             
                }.freeze
         | 
| 15 16 |  | 
| 16 17 | 
             
                class << self
         | 
    
        data/lib/srx/format/html.rb
    ADDED
    
    | @@ -0,0 +1,41 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'set'
         | 
| 4 | 
            +
            require 'English'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            module Srx
         | 
| 7 | 
            +
              module Format
         | 
| 8 | 
            +
                # Support for HTML
         | 
| 9 | 
            +
                #
         | 
| 10 | 
            +
                # @see https://html.spec.whatwg.org/
         | 
| 11 | 
            +
                class Html < Xml
         | 
| 12 | 
            +
                  START_TAG_CAPTURE = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{Xml::ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                  # A set of HTML tags that are "void elements", meaning they do not need a
         | 
| 15 | 
            +
                  # paired closing tag.
         | 
| 16 | 
            +
                  #
         | 
| 17 | 
            +
                  # @see https://html.spec.whatwg.org/#void-elements
         | 
| 18 | 
            +
                  # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command
         | 
| 19 | 
            +
                  # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/keygen
         | 
| 20 | 
            +
                  # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/menuitem
         | 
| 21 | 
            +
                  VOID_ELEMENTS = Set[
         | 
| 22 | 
            +
                    'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
         | 
| 23 | 
            +
                    'link', 'meta', 'menuitem', 'param', 'source', 'track', 'wbr'
         | 
| 24 | 
            +
                  ].freeze
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                  def start_formatting?(markup)
         | 
| 27 | 
            +
                    START_TAG_CAPTURE.match(markup) do |m|
         | 
| 28 | 
            +
                      !VOID_ELEMENTS.include?(m['name'])
         | 
| 29 | 
            +
                    end
         | 
| 30 | 
            +
                  end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  def isolated_formatting?(markup)
         | 
| 33 | 
            +
                    return true if super(markup)
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    START_TAG_CAPTURE.match(markup) do |m|
         | 
| 36 | 
            +
                      VOID_ELEMENTS.include?(m['name'])
         | 
| 37 | 
            +
                    end
         | 
| 38 | 
            +
                  end
         | 
| 39 | 
            +
                end
         | 
| 40 | 
            +
              end
         | 
| 41 | 
            +
            end
         | 
    
        data/lib/srx/version.rb
    CHANGED
    
    
    
        data/srx.gemspec
    CHANGED
    
    | @@ -15,6 +15,7 @@ Gem::Specification.new do |spec| | |
| 15 15 |  | 
| 16 16 | 
             
              spec.metadata['homepage_uri'] = spec.homepage
         | 
| 17 17 | 
             
              spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'
         | 
| 18 | 
            +
              spec.metadata['changelog_uri'] = 'https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md'
         | 
| 18 19 |  | 
| 19 20 | 
             
              # Specify which files should be added to the gem when it is released.
         | 
| 20 21 | 
             
              # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: srx
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.1.0
         | 
| 4 | 
            +
              version: 0.2.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Aaron Madlon-Kay
         | 
| @@ -135,6 +135,7 @@ files: | |
| 135 135 | 
             
            - ".rubocop.yml"
         | 
| 136 136 | 
             
            - ".rubocop_todo.yml"
         | 
| 137 137 | 
             
            - ".solargraph.yml"
         | 
| 138 | 
            +
            - CHANGELOG.md
         | 
| 138 139 | 
             
            - Gemfile
         | 
| 139 140 | 
             
            - Gemfile.lock
         | 
| 140 141 | 
             
            - LICENSE.txt
         | 
| @@ -150,6 +151,7 @@ files: | |
| 150 151 | 
             
            - lib/srx/engine.rb
         | 
| 151 152 | 
             
            - lib/srx/format.rb
         | 
| 152 153 | 
             
            - lib/srx/format/base_format.rb
         | 
| 154 | 
            +
            - lib/srx/format/html.rb
         | 
| 153 155 | 
             
            - lib/srx/format/text.rb
         | 
| 154 156 | 
             
            - lib/srx/format/xml.rb
         | 
| 155 157 | 
             
            - lib/srx/icu_regex.rb
         | 
| @@ -163,6 +165,7 @@ licenses: | |
| 163 165 | 
             
            metadata:
         | 
| 164 166 | 
             
              homepage_uri: https://github.com/amake/srx-ruby
         | 
| 165 167 | 
             
              source_code_uri: https://github.com/amake/srx-ruby.git
         | 
| 168 | 
            +
              changelog_uri: https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md
         | 
| 166 169 | 
             
            post_install_message: 
         | 
| 167 170 | 
             
            rdoc_options: []
         | 
| 168 171 | 
             
            require_paths:
         |