html2asciimath 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: cf33dc42507a6cc5e93f188c5350c1917f756b4650ac57b1b1ea35c01d15ffc6
4
+ data.tar.gz: 51b88bae00fc7ec60138969f69dab9931dfb188acebee4f644933934572d02b6
5
+ SHA512:
6
+ metadata.gz: 12b27fae00e9f1cfddecc0c7002396c61ecc0f673653fcae0eb4152faeda091ceb57be153fa992ca6bf7ed8de9ddaadba81c61cba4047f12372f9e54de1b4b0a
7
+ data.tar.gz: 2d25c46e21b49ee2450c61c50b62769ffbc47c8edc3688d6ac169c6b9c785f026f956c94a15ca2ad5390be6d8441368dc4d9116020132978e789bcae97b0a489
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Sebastian Skałacki
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # HTML2AsciiMath
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/html2asciimath`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'html2asciimath'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle install
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install html2asciimath
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/plurimath/html2asciimath.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/html2asciimath/version"
4
+
5
Gem::Specification.new do |spec|
  # Identity and ownership.
  spec.name = "html2asciimath"
  spec.version = HTML2AsciiMath::VERSION
  spec.authors = ["Ribose"]
  spec.email = ["open.source@ribose.com"]
  spec.license = "MIT"

  spec.summary = "Converts simple math formulae written in pure HTML " \
                 "to AsciiMath"

  spec.homepage = "https://www.plurimath.org/"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/plurimath/html2asciimath"

  # List every file tracked by git, running the command from the directory
  # which contains this gemspec.
  tracked_files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0")
  end

  # Only executables, library code, README, LICENSE, and the gemspec itself
  # are packaged into the released gem.
  packaged_prefixes = ["exe/", "lib/", "README.", "LICENSE."]
  spec.files = tracked_files.select do |path|
    path.start_with?(*packaged_prefixes) || path.end_with?(".gemspec")
  end

  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}).map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.required_ruby_version = Gem::Requirement.new(">= 2.6.0")

  spec.add_runtime_dependency "nokogiri"
  spec.add_runtime_dependency "unicode_scanner", "~> 1.0"
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
module HTML2AsciiMath
  # Abstract syntax tree implemented as an array whose items are either
  # nested AST instances (groups) or any other objects (leaves).
  class AST < Array
    # Gives every object a +to_asciimath+ method which returns the object
    # itself, so that leaves can be rendered with the very same call as
    # nested trees.  Any keyword arguments are accepted and ignored.
    module Refinements
      refine Object do
        def to_asciimath(**)
          self
        end
      end
    end

    using Refinements

    # Renders this tree as a string in which items are separated with single
    # spaces.
    #
    # @param child [Boolean] whether this tree is nested in another one;
    #   a nested tree holding more than one item is wrapped in parentheses.
    # @return [String]
    def to_asciimath(child: false)
      rendered = map { |node| node.to_asciimath(child: true) }.join(" ")
      return rendered unless child && size > 1

      "( #{rendered} )"
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
module HTML2AsciiMath
  # Converts HTML math expressions to AsciiMath.
  #
  # The work is split between two small parsers: HTMLParser deals with HTML
  # syntax, and HTMLTextParser processes textual content found between HTML
  # elements.  Thanks to this two-phase processing, HTMLTextParser receives
  # input which is already decoded (i.e. without any HTML entities), which
  # in turn keeps its grammar simple.
  #
  # The converter itself owns the syntax tree being built, together with
  # a stack of currently open groups; the parsers call back into it.
  #
  # @example
  #   html_string = "<i>x</i>+<i>y</i>"
  #   Converter.new(html_string).transform # => "x + y"
  class Converter
    attr_reader :ast, :ast_stack, :html_parser
    attr_accessor :variable_mode

    # @param str [String] HTML source to be converted
    def initialize(str)
      @html_parser = HTMLParser.new(str, self)
    end

    # Parses the input (unless that has been done already) and populates
    # the syntax tree.  Subsequent calls are no-ops.
    def parse
      unless @ast
        @ast = AST.new
        @ast_stack = [@ast]
        @variable_mode = false
        html_parser.parse
      end
    end

    # Parses the input if necessary, and renders the syntax tree.
    #
    # @return [String] AsciiMath expression
    def to_asciimath
      parse
      ast.to_asciimath
    end

    # Same as #to_asciimath.
    def transform
      to_asciimath
    end

    # Appends given objects to the innermost open group.
    def push(*objs)
      ast_stack.last.push(*objs)
    end

    # Starts a new group; objects pushed from now on land in that group.
    def open_group
      ast_stack << AST.new
    end

    # Finishes the innermost group, and appends it to the enclosing one.
    def close_group
      finished_group = ast_stack.pop
      push(finished_group)
    end
  end
end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
+ require "unicode_scanner"
6
+
7
module HTML2AsciiMath
  # Scans a string for fragments which look like math formulae written in
  # plain HTML, and lets the caller substitute them.
  #
  # Consecutive fragments are gathered into a candidate, and every fragment
  # contributes to the candidate's score.  The candidate is reported as math
  # when its score reaches GOOD_SCORE_THRESHOLD.
  #
  # NOTE(review): relies on UnicodeScanner (external gem) exposing
  # a StringScanner-like API (+scan+, +skip+, +pos+, +string+, +eos?+,
  # +matched?+, +[]+) with character-based positions -- confirm against that
  # gem's documentation.
  class Detector < UnicodeScanner
    # Minimum score at which a candidate is considered to be math.
    GOOD_SCORE_THRESHOLD = 11

    # HTML entity
    RX_ENTITY = /&#?\w{,8};/

    # Whole string is one non-whitespace character or one HTML entity.
    SINGLE_CHAR_OR_ENTITY = /\A(?:\S|#{RX_ENTITY})\z/.freeze
    private_constant :SINGLE_CHAR_OR_ENTITY

    # Word immediately followed with brackets.
    FRAGMENT_WORD_AND_BRACKETS = %r{
      (?> \p{L}+)
      (?: \( .*? \) | \[ .*? \] | \{ .*? \} )
    }x

    # Subscript or superscript element.
    FRAGMENT_SUB_OR_SUP = %r{
      \< sub \> .*? \< / sub \> |
      \< sup \> .*? \< / sup \>
    }xi

    # Bold or italic element, possibly nested in one another.  The innermost
    # content is captured as +inner+.
    FRAGMENT_B_OR_I = %r{
      (?<b_or_i>
        (?> \< b \> ) (?: \g<b_or_i> | (?<inner> [^<>\p{Z}\p{C}]*?)) \< / b \>
        |
        (?> \< i \> ) (?: \g<b_or_i> | (?<inner> [^<>\p{Z}\p{C}]*?)) \< / i \>
      )
    }xi

    FRAGMENT_OTHER = %r{
      # numbers
      \d+[,.]\d+ |
      \d+ |
      # entities
      #{RX_ENTITY} |
      # math symbols with exception of angle brackets which are part of HTML
      # syntax
      (?![<>]) \p{Sm} |
      # some ASCII characters used as operators which do not belong to Sm
      [-/()]
    }xo

    # @param str [String] text to be scanned; it is duplicated, hence
    #   the caller's string is never modified by #replace.
    def initialize(str)
      super(str.dup)
    end

    # Replaces every detected math fragment with the value returned by
    # the block.
    #
    # @yieldparam source_math [String] detected math fragment
    # @yieldreturn [String] replacement for that fragment
    # @return [String] the scanned string with all replacements applied
    def replace
      scan_for_math do |math_start, math_end, _score|
        range = math_start...math_end
        source_math = string[range]
        target_math = yield source_math
        string[range] = target_math
        # The replacement may differ in length from the original fragment,
        # hence the scan position must be shifted by the difference in order
        # to keep pointing at the same place in the remaining text.
        self.pos += target_math.size - source_math.size
      end
      string
    end

    private

    # Walks through the whole string, and yields start position, end
    # position, and score of every candidate which scored well enough.
    def scan_for_math
      fast_forward_inline_whitespace

      until eos?
        assess_candidate
        yield @start, @end, @score if good_score?
        fast_forward_to_next_candidate
      end
    end

    # Starts a new candidate at current position, and consumes as many
    # consecutive fragments as possible.
    def assess_candidate
      init_candidate
      nil while match_candidate_fragment
    end

    # Attempts to consume one fragment at current position (leading inline
    # whitespace is skipped), and updates the score accordingly.
    #
    # @return [Boolean] whether any fragment has been consumed
    def match_candidate_fragment
      fast_forward_inline_whitespace

      if scan(FRAGMENT_WORD_AND_BRACKETS)
        score_sure
      elsif scan(FRAGMENT_SUB_OR_SUP)
        score_almost_sure
      elsif scan(FRAGMENT_B_OR_I)
        single_char_or_entity?(self[:inner]) ? score_sure : score_maybe
      elsif scan(FRAGMENT_OTHER)
        score_maybe
      end

      # Scoring methods do not scan, hence this reflects the outcome of
      # the most recent scan attempt above.
      matched?
    end

    def fast_forward_inline_whitespace
      skip(/\p{Zs}*/)
    end

    # Skips one whole word or one arbitrary character, along with any
    # whitespace which follows.
    def fast_forward_to_next_candidate
      skip(/(?>\p{L}+|.)[[:space:]]*/m)
    end

    def init_candidate
      @score = 0
      @start = pos
    end

    # A single fragment of this kind suffices to reach the threshold.
    def score_sure
      @score += GOOD_SCORE_THRESHOLD
      @end = pos
    end

    # A fragment of this kind needs just one more fragment of any kind to
    # reach the threshold.
    def score_almost_sure
      @score += GOOD_SCORE_THRESHOLD - 1
      @end = pos
    end

    def score_maybe
      @score += 1
      @end = pos
    end

    def good_score?
      @score >= GOOD_SCORE_THRESHOLD
    end

    def single_char_or_entity?(str)
      str.match?(SINGLE_CHAR_OR_ENTITY)
    end
  end
end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
+ require "forwardable"
6
+ require "nokogiri"
7
+
8
module HTML2AsciiMath
  # SAX parser which handles the HTML syntax of the input.  Recognized
  # elements are translated into calls on the converter, whereas textual
  # content is handed over to HTMLTextParser.
  class HTMLParser < Nokogiri::HTML::SAX::Parser
    extend Forwardable

    attr_reader :converter, :string

    def_delegators :@converter, :push, :open_group, :close_group, :variable_mode=

    # @param str [String] HTML source
    # @param converter [Converter] receiver of parsing results
    def initialize(str, converter)
      super(SAXCallbacks.new(self))
      @string = str
      @converter = converter
    end

    # Parses the string given to the constructor.
    def parse
      super(string)
    end

    private

    # Element handlers.  Each one is called with +true+ when the element is
    # opened, and with +false+ when it is closed.

    def on_i(opening)
      self.variable_mode = opening
    end

    def on_sub(opening)
      toggle_script_group("_", opening)
    end

    def on_sup(opening)
      toggle_script_group("^", opening)
    end

    # When opening, pushes given operator and starts a new group for
    # the element's content.  When closing, finishes that group.
    def toggle_script_group(operator, opening)
      return close_group unless opening

      push operator
      open_group
    end

    # Associates element names with element handlers.
    #
    # Example: <code>{ "some_tag_name" => :on_some_tag_name }</code>
    ELEMENT_HANDLERS =
      (instance_methods + private_instance_methods)
        .grep(/\Aon_/)
        .to_h { |handler| [handler.to_s[3..].freeze, handler] }
        .freeze

    # Receives SAX events, and dispatches them to the parser.
    class SAXCallbacks < Nokogiri::XML::SAX::Document
      attr_reader :parser

      def initialize(parser)
        @parser = parser
      end

      def characters(text)
        HTMLTextParser.new(text, parser.converter).parse
        true
      end

      def start_element(elem_name, _attrs = [])
        # TODO maintain some elements stack
        dispatch(elem_name, true)
      end

      def end_element(elem_name)
        # TODO auto-close elements which are above this one in elements stack
        dispatch(elem_name, false)
      end

      private

      # Calls the handler registered for given element, if there is any.
      def dispatch(elem_name, opening)
        handler = ELEMENT_HANDLERS[elem_name]
        handler && parser.send(handler, opening)
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
+ require "forwardable"
6
+ require "unicode_scanner"
7
+
8
module HTML2AsciiMath
  # Processes textual content found between HTML elements, and pushes
  # recognized numbers, words, and symbols to the converter.
  class HTMLTextParser < UnicodeScanner
    extend Forwardable

    # Keys are HTML math symbols which have some special meaning.  Values are
    # their AsciiMath equivalents, or nil when the symbol is the same in
    # AsciiMath and needs no translation.
    # @todo Perhaps brackets should be handled separately.
    SYMBOLS = {
      "-" => nil,
      "+" => nil,
      "×" => "xx",
      "/" => "//",
      "÷" => "-:",
      "\u22c5" => "*", # (dot operator)
      "=" => nil,
      "≤" => "<=",
      "≥" => ">=",
      "≠" => "!=",
      "¬" => "not",
      "∧" => "and",
      "∨" => "or",
      "(" => nil,
      ")" => nil,
      "%" => nil,
      "!" => nil,
      "∃" => "EE",
      "∀" => "AA",
      "∞" => "oo"
    }.freeze

    # A regular expression which matches every symbol defined in +SYMBOLS+ hash.
    SYMBOLS_RX =
      Regexp.new(SYMBOLS.keys.map { |key| Regexp.escape(key) }.join("|")).freeze

    attr_reader :converter

    def_delegators :@converter, :push, :open_group, :close_group, :variable_mode

    # @param str [String] text to be parsed
    # @param converter [Converter] receiver of parsing results
    def initialize(str, converter)
      super(str)
      @converter = converter
    end

    # Consumes the text piece by piece.  Scanners are tried in order, and
    # the first one which succeeds wins.  Parsing stops at the end of string,
    # or at the first piece which no scanner recognizes.
    def parse # rubocop:disable Metrics/CyclomaticComplexity
      repeat_until_error_or_eos do
        skip_ws || scan_number || scan_text || scan_symbol || scan_error
      end
    end

    private

    # Yields repeatedly until end of string is reached, or until +:error+ is
    # thrown from the block.
    def repeat_until_error_or_eos
      catch(:error) { yield until eos? }
    end

    def scan_error
      throw :error
    end

    def skip_ws
      skip(/\s+/)
    end

    def scan_number
      number = scan(/\d+(?:\.\d+)?/)
      return unless number

      push(number)
      true
    end

    def scan_text
      word = scan(/\p{Letter}+/)
      return unless word

      # TODO distinguish variables (which should be left unquoted), regular
      # text (which should be quoted), and textual operators (e.g. sum).
      push(variable_mode ? word : %["#{word}"])
      true
    end

    def scan_symbol
      # Any character that does not belong to Control category.
      char = scan(/\p{^C}/)
      return unless char

      push(SYMBOLS[char] || char)
      true
    end
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
module HTML2AsciiMath
  # Version of this gem.
  VERSION = "0.1.0"
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) 2021 Ribose Inc.
4
+
5
+ require_relative "html2asciimath/version"
6
+ require_relative "html2asciimath/ast"
7
+ require_relative "html2asciimath/converter"
8
+ require_relative "html2asciimath/detector"
9
+ require_relative "html2asciimath/html_parser"
10
+ require_relative "html2asciimath/html_text_parser"
11
+
12
module HTML2AsciiMath
  class Error < StandardError
  end

  # Converts a math expression written in HTML to AsciiMath.
  #
  # @param input [String, nil] HTML source; surrounding whitespace is ignored
  # @return [String, nil] AsciiMath expression; +nil+ and blank inputs are
  #   not converted -- +nil+ and an empty string are returned for them
  #   respectively
  def convert(input)
    stripped = input&.strip
    return stripped unless stripped && !stripped.empty?

    converter = Converter.new(stripped)
    converter.transform
  end

  # Detects math fragments in given text, and replaces each of them with
  # the value returned by the block.
  #
  # @param input [String] text which may contain HTML math
  # @yieldparam [String] detected math fragment
  # @return [String] text with replacements applied
  def html_replace(input, &block)
    detector = Detector.new(input)
    detector.replace(&block)
  end

  module_function :convert, :html_replace
end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2asciimath
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ribose
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-01-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: unicode_scanner
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.0'
41
+ description:
42
+ email:
43
+ - open.source@ribose.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - LICENSE.txt
49
+ - README.md
50
+ - html2asciimath.gemspec
51
+ - lib/html2asciimath.rb
52
+ - lib/html2asciimath/ast.rb
53
+ - lib/html2asciimath/converter.rb
54
+ - lib/html2asciimath/detector.rb
55
+ - lib/html2asciimath/html_parser.rb
56
+ - lib/html2asciimath/html_text_parser.rb
57
+ - lib/html2asciimath/version.rb
58
+ homepage: https://www.plurimath.org/
59
+ licenses:
60
+ - MIT
61
+ metadata:
62
+ homepage_uri: https://www.plurimath.org/
63
+ source_code_uri: https://github.com/plurimath/html2asciimath
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 2.6.0
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubygems_version: 3.3.3
80
+ signing_key:
81
+ specification_version: 4
82
+ summary: Converts simple math formulae written in pure HTML to AsciiMath
83
+ test_files: []