html2asciimath 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: cf33dc42507a6cc5e93f188c5350c1917f756b4650ac57b1b1ea35c01d15ffc6
4
+ data.tar.gz: 51b88bae00fc7ec60138969f69dab9931dfb188acebee4f644933934572d02b6
5
+ SHA512:
6
+ metadata.gz: 12b27fae00e9f1cfddecc0c7002396c61ecc0f673653fcae0eb4152faeda091ceb57be153fa992ca6bf7ed8de9ddaadba81c61cba4047f12372f9e54de1b4b0a
7
+ data.tar.gz: 2d25c46e21b49ee2450c61c50b62769ffbc47c8edc3688d6ac169c6b9c785f026f956c94a15ca2ad5390be6d8441368dc4d9116020132978e789bcae97b0a489
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Sebastian Skałacki
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # HTML2AsciiMath
2
+
3
+ HTML2AsciiMath converts simple math formulae written in pure HTML to AsciiMath. It can also scan a larger HTML string for math-like fragments and replace each of them.
4
+
5
+ To experiment with the code, run `bin/console` for an interactive prompt.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'html2asciimath'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle install
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install html2asciimath
22
+
23
+ ## Usage
24
+
25
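+ Call `HTML2AsciiMath.convert(html)` to convert an HTML math expression to AsciiMath; for example, `HTML2AsciiMath.convert("<i>x</i>+<i>y</i>")` returns `"x + y"`. Call `HTML2AsciiMath.html_replace(html) { |math| ... }` to scan `html` for math-like fragments and replace each one with the value returned by the block.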
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/plurimath/html2asciimath.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/html2asciimath/version"
4
+
5
# Gem specification for html2asciimath: package metadata, the list of files
# shipped in the released gem, and its runtime dependencies.
Gem::Specification.new do |spec|
  spec.name     = "html2asciimath"
  spec.version  = HTML2AsciiMath::VERSION
  spec.authors  = ["Ribose"]
  spec.email    = ["open.source@ribose.com"]
  spec.license  = "MIT"

  spec.summary  = "Converts simple math formulae written in pure HTML " \
                  "to AsciiMath"

  spec.homepage = "https://www.plurimath.org/"

  spec.metadata["homepage_uri"]    = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/plurimath/html2asciimath"

  # Ask git, from the gemspec's own directory, for every tracked file.
  # The -z flag separates the file names with NUL bytes.
  gem_root = File.expand_path(__dir__)
  tracked_files = Dir.chdir(gem_root) { `git ls-files -z`.split("\x0") }

  # Specify which files should be added to the gem when it is released:
  # executables, library code, README/LICENSE files and the gemspec itself.
  shipped_prefixes = ["exe/", "lib/", "README.", "LICENSE."]
  spec.files = tracked_files.select do |path|
    path.start_with?(*shipped_prefixes) || path.end_with?(".gemspec")
  end

  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{\Aexe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.required_ruby_version = Gem::Requirement.new(">= 2.6.0")

  spec.add_runtime_dependency "nokogiri"
  spec.add_runtime_dependency "unicode_scanner", "~> 1.0"
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
module HTML2AsciiMath
  # Abstract syntax tree implemented as an Array whose items are either nested
  # AST instances (sub-expressions) or arbitrary other objects (leaves).
  class AST < Array
    # Gives every object a +to_asciimath+ method which returns the object
    # itself, so that leaves and nested trees can be rendered through the same
    # call.  Being a refinement, it is only active where +using Refinements+
    # is in effect.
    module Refinements
      refine Object do
        # Accepts and ignores any keyword arguments.
        def to_asciimath(**)
          self
        end
      end
    end

    using Refinements

    # Renders the tree as a space-separated string.
    #
    # @param child [Boolean] whether this tree is rendered as an item of
    #   another tree; a child holding more than one item is wrapped in
    #   parentheses.
    # @return [String]
    def to_asciimath(child: false)
      rendered_items = map { |node| node.to_asciimath(child: true) }
      joined = rendered_items.join(" ")
      return joined unless child && size > 1

      "( #{joined} )"
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
module HTML2AsciiMath
  # This class is responsible for converting HTML math expressions to
  # AsciiMath.
  #
  # It runs two small parsers: first HTMLParser deals with HTML syntax, and
  # then HTMLTextParser processes textual content found between HTML elements.
  # Thanks to this two-phase processing, HTMLTextParser receives input which
  # is already decoded (i.e. without any HTML entities), which in turn keeps
  # its grammar simple.
  #
  # The parsers report what they find through #push, #open_group and
  # #close_group, which build the AST kept by this object.
  #
  # @example
  #   html_string = "<i>x</i>+<i>y</i>"
  #   Converter.new(html_string).transform # => "x + y"
  class Converter
    attr_reader :ast, :ast_stack, :html_parser
    attr_accessor :variable_mode

    # @param str [String] HTML source of the math expression
    def initialize(str)
      @html_parser = HTMLParser.new(str, self)
    end

    # Parses the input (only once) and fills in the AST.  Subsequent calls
    # return immediately.
    def parse
      return if @ast

      @variable_mode = false
      @ast = AST.new
      @ast_stack = [@ast]
      html_parser.parse
    end

    # Parses the input if that has not happened yet and renders the AST.
    #
    # @return [String] AsciiMath representation of the input
    def to_asciimath
      parse
      ast.to_asciimath
    end

    # Same as #to_asciimath.
    def transform
      to_asciimath
    end

    # Starts a new sub-expression; items pushed from now on land in it.
    def open_group
      ast_stack << AST.new
    end

    # Finishes the innermost sub-expression and appends it, as a single item,
    # to the group that encloses it.
    def close_group
      finished_group = ast_stack.pop
      push(finished_group)
    end

    # Appends given objects to the innermost open group.
    def push(*objs)
      current_group = ast_stack.last
      current_group.push(*objs)
    end
  end
end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
+ require "unicode_scanner"
6
+
7
module HTML2AsciiMath
  # Scans a string for fragments which look like math written in HTML and
  # lets the caller replace each of them.
  #
  # A candidate is a run of adjacent fragments, optionally separated by inline
  # whitespace.  Every matched fragment adds to the candidate's score, and the
  # candidate is reported as math once its score reaches
  # +GOOD_SCORE_THRESHOLD+.
  class Detector < UnicodeScanner
    # Minimum score a candidate needs in order to be reported as math.
    GOOD_SCORE_THRESHOLD = 11

    # HTML entity
    RX_ENTITY = /&#?\w{,8};/

    # Word immediately followed with brackets.
    FRAGMENT_WORD_AND_BRACKETS = %r{
      (?> \p{L}+)
      (?: \( .*? \) | \[ .*? \] | \{ .*? \} )
    }x

    # A +sub+ or +sup+ element together with its content.
    FRAGMENT_SUB_OR_SUP = %r{
      \< sub \> .*? \< / sub \> |
      \< sup \> .*? \< / sup \>
    }xi

    # A +b+ or +i+ element, possibly wrapping further +b+/+i+ elements.  The
    # innermost text is captured as +inner+.
    FRAGMENT_B_OR_I = %r{
      (?<b_or_i>
        (?> \< b \> ) (?: \g<b_or_i> | (?<inner> [^<>\p{Z}\p{C}]*?)) \< / b \>
        |
        (?> \< i \> ) (?: \g<b_or_i> | (?<inner> [^<>\p{Z}\p{C}]*?)) \< / i \>
      )
    }xi

    FRAGMENT_OTHER = %r{
      # numbers
      \d+[,.]\d+ |
      \d+ |
      # entities
      #{RX_ENTITY} |
      # math symbols with exception of angle brackets which are part of HTML
      # syntax
      (?![<>]) \p{Sm} |
      # some ASCII characters used as operators which do not belong to Sm
      [-/()]
    }xo

    # @param str [String] text to scan; it is duplicated, so the caller's
    #   string is never modified.
    def initialize(str)
      super(str.dup)
    end

    # Replaces every detected math fragment with the value returned by the
    # block.
    #
    # @yieldparam source_math [String] the detected fragment
    # @yieldreturn [String] the replacement for that fragment
    # @return [String] the scanned string with all replacements applied
    def replace(&block)
      scan_for_math do |math_start, math_end, _score|
        range = (math_start...math_end)
        source_math = string[range]
        target_math = yield source_math
        string[range] = target_math
        # The replacement may differ in length from the original, so shift
        # the scan position accordingly.
        # NOTE(review): assumes +pos+ counts in the same units as String#size
        # -- confirm against UnicodeScanner.
        self.pos += (target_math.size - source_math.size)
      end
      string
    end

    private

    # Walks through the whole string and yields +[start, end, score]+ for
    # every candidate whose score is good enough.
    def scan_for_math
      fast_forward_inline_whitespace

      until eos?
        assess_candidate
        yield [@start, @end, @score] if good_score?
        fast_forward_to_next_candidate
      end
    end

    # Starts a new candidate at the current position and consumes as many
    # consecutive fragments as possible.
    def assess_candidate
      init_candidate
      nil while match_candidate_fragment
    end

    # Tries to consume one fragment (preceded by optional inline whitespace)
    # and scores it.
    #
    # @return [Boolean] whether the most recent scan attempt matched
    def match_candidate_fragment
      fast_forward_inline_whitespace

      if scan(FRAGMENT_WORD_AND_BRACKETS)
        score_sure
      elsif scan(FRAGMENT_SUB_OR_SUP)
        score_almost_sure
      elsif scan(FRAGMENT_B_OR_I)
        single_char_or_entity?(self[:inner]) ? score_sure : score_maybe
      elsif scan(FRAGMENT_OTHER)
        score_maybe
      end

      matched?
    end

    def fast_forward_inline_whitespace
      skip(/\p{Zs}*/)
    end

    # Skips one token (a whole word or any single character) and the
    # whitespace which follows it.
    def fast_forward_to_next_candidate
      skip(/(?>\p{L}+|.)[[:space:]]*/m)
    end

    def init_candidate
      @score = 0
      @start = pos
    end

    # A fragment which on its own is enough to accept the candidate.
    def score_sure
      add_score(GOOD_SCORE_THRESHOLD)
    end

    # A fragment which needs just one more point to accept the candidate.
    def score_almost_sure
      add_score(GOOD_SCORE_THRESHOLD - 1)
    end

    # A fragment which is only a weak hint of math.
    def score_maybe
      add_score(1)
    end

    # Adds +points+ to the candidate's score and records the current position
    # as the end of the candidate.
    def add_score(points)
      @score += points
      @end = pos
    end

    def good_score?
      @score >= GOOD_SCORE_THRESHOLD
    end

    # Tells whether +str+ is exactly one non-whitespace character or one HTML
    # entity.
    #
    # +to_s+ guards against a nil +inner+ capture, which would otherwise raise
    # NoMethodError.  NOTE(review): whether the capture can actually be unset
    # (e.g. for nested tags) has not been verified.
    def single_char_or_entity?(str)
      str.to_s.match?(/\A(\S|#{RX_ENTITY})\Z/o)
    end
  end
end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
+ require "forwardable"
6
+ require "nokogiri"
7
+
8
module HTML2AsciiMath
  # SAX-based HTML parser.  It translates supported HTML elements into
  # operations on the converter, and hands the text found between elements
  # over to HTMLTextParser.
  class HTMLParser < Nokogiri::HTML::SAX::Parser
    extend Forwardable

    attr_reader :converter, :string

    def_delegators :@converter, :push, :open_group, :close_group, :variable_mode=

    # @param str [String] HTML source to be parsed
    # @param converter [Converter] receiver of the parsed items
    def initialize(str, converter)
      super(SAXCallbacks.new(self))
      @string = str
      @converter = converter
    end

    # Parses the string given to the constructor.
    def parse
      super(string)
    end

    private

    # Element handlers.  Each one is called with +true+ when its element is
    # opened, and with +false+ when it is closed.

    def on_i(opening)
      self.variable_mode = opening
    end

    def on_sub(opening)
      opening ? start_script("_") : close_group
    end

    def on_sup(opening)
      opening ? start_script("^") : close_group
    end

    # Pushes the subscript/superscript operator and opens a group for the
    # content of the element.
    def start_script(operator)
      push operator
      open_group
    end

    # Associates element names with element handlers.
    #
    # Example: <code>{ "some_tag_name" => :on_some_tag_name }</code>
    ELEMENT_HANDLERS =
      (instance_methods + private_instance_methods)
        .each_with_object({}) do |handler, table|
          handler_name = handler.to_s
          next unless handler_name.start_with?("on_")

          table[handler_name.delete_prefix("on_").freeze] = handler
        end
        .freeze

    # Receives SAX events and dispatches them to the parser.
    class SAXCallbacks < Nokogiri::XML::SAX::Document
      attr_reader :parser

      def initialize(parser)
        @parser = parser
      end

      # Text between elements is processed by HTMLTextParser.
      def characters(text)
        HTMLTextParser.new(text, parser.converter).parse
        true
      end

      def start_element(elem_name, _attrs = [])
        # TODO maintain some elements stack
        dispatch_element(elem_name, true)
      end

      def end_element(elem_name)
        # TODO auto-close elements which are above this one in elements stack
        dispatch_element(elem_name, false)
      end

      private

      # Calls the parser's handler for given element, if there is one.
      # Elements without a handler are ignored.
      def dispatch_element(elem_name, opening)
        handler = ELEMENT_HANDLERS[elem_name]
        parser.send(handler, opening) if handler
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
+ require "forwardable"
6
+ require "unicode_scanner"
7
+
8
module HTML2AsciiMath
  # Parses textual content found between HTML elements, and pushes recognized
  # numbers, words and symbols to the converter.
  class HTMLTextParser < UnicodeScanner
    extend Forwardable

    # Left side is a HTML math symbol recognized by scanner.  Right side is its
    # AsciiMath equivalent or nil when no translation is needed.
    # @todo Perhaps brackets should be handled separately.
    SYMBOLS = {
      "-" => nil,
      "+" => nil,
      "×" => "xx",
      "/" => "//",
      "÷" => "-:",
      "\u22c5" => "*", # (dot operator)
      "=" => nil,
      "≤" => "<=",
      "≥" => ">=",
      "≠" => "!=",
      "¬" => "not",
      "∧" => "and",
      "∨" => "or",
      "(" => nil,
      ")" => nil,
      "%" => nil,
      "!" => nil,
      "∃" => "EE",
      "∀" => "AA",
      "∞" => "oo"
    }
      .freeze

    # A regular expression which matches every symbol defined in +SYMBOLS+ hash.
    SYMBOLS_RX =
      Regexp.new(SYMBOLS.keys.map { |k| Regexp.escape(k) }.join("|")).freeze

    attr_reader :converter

    def_delegators :@converter, :push, :open_group, :close_group, :variable_mode

    # @param str [String] text to be parsed
    # @param converter [Converter] receiver of the parsed items
    def initialize(str, converter)
      super(str)
      @converter = converter
    end

    # Consumes the text token by token.  At every step the first scanner which
    # matches wins; when none does, parsing stops.
    def parse # rubocop:disable Metrics/CyclomaticComplexity
      repeat_until_error_or_eos do
        skip_ws || scan_number || scan_text || scan_symbol || scan_error
      end
    end

    private

    # Runs the block over and over, until the end of the string is reached or
    # +:error+ is thrown.
    def repeat_until_error_or_eos
      catch(:error) do
        until eos?
          yield
        end
      end
    end

    # Stops parsing.
    def scan_error
      throw :error
    end

    def skip_ws
      skip(/\s+/)
    end

    # Integer or decimal number, pushed as is.
    def scan_number
      number = scan(/\d+(?:\.\d+)?/)
      return unless number

      push(number)
      true
    end

    # A run of letters, pushed as is in variable mode, and in double quotes
    # otherwise.
    def scan_text
      text = scan(/\p{Letter}+/)
      return unless text

      # TODO distinguish variables (which should be left unquoted), regular
      # text (which should be quoted), and textual operators (e.g. sum).
      item = variable_mode ? text : %["#{text}"]
      push(item)
      true
    end

    # Any character that does not belong to Control category.  It is
    # translated according to +SYMBOLS+, or pushed unchanged when that hash has
    # no translation for it.
    def scan_symbol
      char = scan(/\p{^C}/)
      return unless char

      push(SYMBOLS[char] || char)
      true
    end
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
module HTML2AsciiMath
  # Version of the html2asciimath gem.
  VERSION = "0.1.0"
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
+ require_relative "html2asciimath/version"
6
+ require_relative "html2asciimath/ast"
7
+ require_relative "html2asciimath/converter"
8
+ require_relative "html2asciimath/detector"
9
+ require_relative "html2asciimath/html_parser"
10
+ require_relative "html2asciimath/html_text_parser"
11
+
12
# Entry point of the library.  Both methods are module functions, e.g.
# <code>HTML2AsciiMath.convert(html)</code>.
module HTML2AsciiMath
  class Error < StandardError; end

  module_function

  # Converts an HTML math expression to AsciiMath.
  #
  # @param input [String, nil] HTML source; surrounding whitespace is ignored
  # @return [String, nil] result of the conversion; +nil+ for +nil+ input and
  #   an empty string for blank input
  def convert(input)
    stripped = input&.strip

    if stripped.nil? || stripped.empty?
      stripped
    else
      Converter.new(stripped).transform
    end
  end

  # Scans +input+ for math fragments and replaces each of them with the value
  # returned by the block (see Detector#replace).
  #
  # @param input [String] text to be scanned
  # @return [String] text with the replacements applied
  def html_replace(input, &block)
    detector = Detector.new(input)
    detector.replace(&block)
  end
end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2asciimath
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ribose
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-01-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: unicode_scanner
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.0'
41
+ description:
42
+ email:
43
+ - open.source@ribose.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - LICENSE.txt
49
+ - README.md
50
+ - html2asciimath.gemspec
51
+ - lib/html2asciimath.rb
52
+ - lib/html2asciimath/ast.rb
53
+ - lib/html2asciimath/converter.rb
54
+ - lib/html2asciimath/detector.rb
55
+ - lib/html2asciimath/html_parser.rb
56
+ - lib/html2asciimath/html_text_parser.rb
57
+ - lib/html2asciimath/version.rb
58
+ homepage: https://www.plurimath.org/
59
+ licenses:
60
+ - MIT
61
+ metadata:
62
+ homepage_uri: https://www.plurimath.org/
63
+ source_code_uri: https://github.com/plurimath/html2asciimath
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 2.6.0
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubygems_version: 3.3.3
80
+ signing_key:
81
+ specification_version: 4
82
+ summary: Converts simple math formulae written in pure HTML to AsciiMath
83
+ test_files: []