latex-decode 0.0.12-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +10 -0
- data/LICENSE +621 -0
- data/README.md +63 -0
- data/features/brackets.feature +11 -0
- data/features/diacritics.feature +40 -0
- data/features/non-latex.feature +15 -0
- data/features/punctuation.feature +46 -0
- data/features/special_characters.feature +20 -0
- data/features/step_definitions/latex.rb +7 -0
- data/features/support/env.rb +1 -0
- data/features/umlauts.feature +11 -0
- data/latex-decode.gemspec +39 -0
- data/lib/latex/decode.rb +49 -0
- data/lib/latex/decode/accents.rb +36 -0
- data/lib/latex/decode/base.rb +60 -0
- data/lib/latex/decode/compatibility.rb +59 -0
- data/lib/latex/decode/diacritics.rb +46 -0
- data/lib/latex/decode/punctuation.rb +57 -0
- data/lib/latex/decode/symbols.rb +223 -0
- data/lib/latex/decode/version.rb +5 -0
- metadata +143 -0
data/README.md
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
LaTeX::Decode
|
2
|
+
=============
|
3
|
+
|
4
|
+
LaTeX::Decode is a Ruby gem to convert LaTeX input to Unicode. Its original
|
5
|
+
use was as an input filter for [BibTeX-Ruby](http://rubygems.org/gems/bibtex-ruby)
|
6
|
+
but it can be used independently to decode LaTeX. Many of the patterns used by
|
7
|
+
this Ruby gem are based on François Charette's equivalent Perl module
|
8
|
+
[LaTeX::Decode](https://github.com/fc7/LaTeX-Decode).
|
9
|
+
|
10
|
+
Quickstart
|
11
|
+
----------
|
12
|
+
|
13
|
+
$ [sudo] gem install latex-decode
|
14
|
+
$ irb
|
15
|
+
>> require 'latex/decode'
|
16
|
+
>> LaTeX.decode "dipl\\^{o}me d'\\'{e}tudes sup\\'erieures"
|
17
|
+
=> "diplôme d'études supérieures"
|
18
|
+
|
19
|
+
Compatibility
|
20
|
+
-------------
|
21
|
+
|
22
|
+
Unicode handling is one of the major differences between Ruby 1.8 and newer
|
23
|
+
versions; nevertheless, LaTeX::Decode tries to support 1.8 as best as possible.
|
24
|
+
|
25
|
+
Issues
|
26
|
+
------
|
27
|
+
|
28
|
+
Please use the tracker of the project's
|
29
|
+
[Github repository](https://github.com/inukshuk/latex-decode) to report any
|
30
|
+
issues. When describing intended behaviour, please use the extremely simple
|
31
|
+
syntax of the Cucumber features used by LaTeX::Decode; for instance, you could
|
32
|
+
describe the example above as:
|
33
|
+
|
34
|
+
Feature: Decode LaTeX accents
|
35
|
+
As a hacker who works with LaTeX
|
36
|
+
I want to be able to decode LaTeX accents
|
37
|
+
|
38
|
+
Scenario: A French sentence
|
39
|
+
When I decode the string "dipl\\^{o}me d'\\'{e}tudes sup\\'erieures"
|
40
|
+
Then the result should be "diplôme d'études supérieures"
|
41
|
+
|
42
|
+
Credits
|
43
|
+
-------
|
44
|
+
|
45
|
+
Kudos and thanks to all [contributors](https://github.com/inukshuk/latex-decode/contributors)
|
46
|
+
who have made LaTeX::Decode possible!
|
47
|
+
|
48
|
+
Copyright (C) 2011 [Sylvester Keil](http://sylvester.keil.or.at)
|
49
|
+
|
50
|
+
Copyright (C) 2010 François Charette
|
51
|
+
|
52
|
+
This program is free software: you can redistribute it and/or modify
|
53
|
+
it under the terms of the GNU General Public License as published by
|
54
|
+
the Free Software Foundation, either version 3 of the License, or
|
55
|
+
(at your option) any later version.
|
56
|
+
|
57
|
+
This program is distributed in the hope that it will be useful,
|
58
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
59
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
60
|
+
GNU General Public License for more details.
|
61
|
+
|
62
|
+
You should have received a copy of the GNU General Public License
|
63
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
@@ -0,0 +1,11 @@
|
|
1
|
+
Feature: Remove LaTeX brackets
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to remove brackets around single characters
|
4
|
+
|
5
|
+
Scenario: Single character in curly brackets
|
6
|
+
When I decode the string '{a}'
|
7
|
+
Then the result should be 'a'
|
8
|
+
|
9
|
+
Scenario: German umlauts in curly brackets
|
10
|
+
When I decode the string '{\"A}{\"o}{\"u}'
|
11
|
+
Then the result should be 'Äöü'
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Feature: Decode LaTeX diacritics
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to decode LaTeX diacritics
|
4
|
+
|
5
|
+
Scenario Outline: LaTeX to Unicode transformation
|
6
|
+
When I decode the string '<latex>'
|
7
|
+
Then the result should be '<unicode>'
|
8
|
+
|
9
|
+
Scenarios: Diacritics
|
10
|
+
| latex | unicode | description |
|
11
|
+
| \\\`{o} | ò | grave accent |
|
12
|
+
| \\\'{o} | ó | acute accent |
|
13
|
+
| \\^{o} | ô | circumflex |
|
14
|
+
| \\"{o} | ö | umlaut or dieresis |
|
15
|
+
| \\H{o} | ő | long Hungarian umlaut (double acute) |
|
16
|
+
| \\~{o} | õ | tilde |
|
17
|
+
| \\c{c} | ç | cedilla |
|
18
|
+
| \\c c | ç | |
|
19
|
+
| \\c cb | çb | |
|
20
|
+
| \\c {cb}| \\c cb | |
|
21
|
+
| \\c C | Ç | |
|
22
|
+
| {\\c c} | ç | |
|
23
|
+
| \\k{a} | ą | ogonek |
|
24
|
+
| \\l | ł | l with stroke |
|
25
|
+
| \\L | Ł | l with stroke |
|
26
|
+
| \\={o} | ō | macron accent (a bar over the letter) |
|
27
|
+
| \\b{o} | o̱ | bar under the letter |
|
28
|
+
| \\.{o} | ȯ | dot over the letter |
|
29
|
+
| \\d{u} | ụ | dot under the letter |
|
30
|
+
| \\r{a} | å | ring over the letter |
|
31
|
+
| \\u{o} | ŏ | breve over the letter |
|
32
|
+
| \\v{s} | š | caron/hacek ("v") over the letter |
|
33
|
+
| \\t{oo} | o͡o | "tie" (inverted u) over the two letters |
|
34
|
+
| \\aa | å | ring over the letter a |
|
35
|
+
| \\AA | Å | ring over the letter A |
|
36
|
+
| \\o | ø | slashed o |
|
37
|
+
| \\O | Ø | slashed O |
|
38
|
+
| \\ae | æ | ae |
|
39
|
+
| \\AE | Æ | AE |
|
40
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Feature: Keep non-LaTeX markup as it is
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to preserve non-LaTeX text as it is
|
4
|
+
|
5
|
+
Scenario: Standalone escape characters
|
6
|
+
When I decode the string '\\'
|
7
|
+
Then the result should be '\\'
|
8
|
+
|
9
|
+
Scenario: Common markup in Regular Expressions
|
10
|
+
When I decode the string '.*'
|
11
|
+
Then the result should be '.*'
|
12
|
+
When I decode the string '^x$'
|
13
|
+
Then the result should be '^x$'
|
14
|
+
When I decode the string '\\\\2'
|
15
|
+
Then the result should be '\\\\2'
|
@@ -0,0 +1,46 @@
|
|
1
|
+
Feature: Decode LaTeX punctuation directives
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to decode LaTeX punctuation marks
|
4
|
+
|
5
|
+
Scenario Outline: LaTeX to Unicode transformation
|
6
|
+
When I decode the string '<latex>'
|
7
|
+
Then the result should be '<unicode>'
|
8
|
+
|
9
|
+
Scenarios: Punctuation macros
|
10
|
+
| latex | unicode |
|
11
|
+
| \\textendash | – |
|
12
|
+
| \\textemdash | — |
|
13
|
+
| \\textquoteleft | ‘ |
|
14
|
+
| \\textquoteright | ’ |
|
15
|
+
| \\quotesinglbase | ‚ |
|
16
|
+
| \\textquotedblleft | “ |
|
17
|
+
| \\textquotedblright | ” |
|
18
|
+
| \\quotedblbase | „ |
|
19
|
+
| \\dag | † |
|
20
|
+
| \\ddag | ‡ |
|
21
|
+
| \\textbullet | • |
|
22
|
+
| \\dots | … |
|
23
|
+
| \\textperthousand | ‰ |
|
24
|
+
| \\textpertenthousand | ‱ |
|
25
|
+
| \\guilsinglleft | ‹ |
|
26
|
+
| \\guilsinglright | › |
|
27
|
+
| \\textreferencemark | ※ |
|
28
|
+
| \\textinterrobang | ‽ |
|
29
|
+
| \\textoverline | ‾ |
|
30
|
+
| \\langle | ⟨ |
|
31
|
+
| \\rangle | ⟩ |
|
32
|
+
|
33
|
+
|
34
|
+
Scenarios: Punctuation symbols
|
35
|
+
| latex | unicode | description |
|
36
|
+
| - | - | hyphen |
|
37
|
+
| -- | – | en-dash |
|
38
|
+
| --- | — | em-dash |
|
39
|
+
| \\~{} | ~ | tilde |
|
40
|
+
|
41
|
+
Scenarios: Quotation marks
|
42
|
+
| latex | unicode | description |
|
43
|
+
| `` | “ | left double quotes |
|
44
|
+
| '' | ” | right double quotes |
|
45
|
+
| ` | ‘ | left single quotes |
|
46
|
+
| ' | ’ | right single quotes |
|
@@ -0,0 +1,20 @@
|
|
1
|
+
Feature: Decode LaTeX special characters
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to decode a few special characters which are escaped by LaTeX
|
4
|
+
|
5
|
+
Scenario Outline: LaTeX to Unicode transformation
|
6
|
+
When I decode the string '<latex>'
|
7
|
+
Then the result should be '<unicode>'
|
8
|
+
|
9
|
+
Scenarios: Special characters
|
10
|
+
| latex | unicode |
|
11
|
+
| \\\& | & |
|
12
|
+
| \\# | # |
|
13
|
+
| \\$ | $ |
|
14
|
+
| \\% | % |
|
15
|
+
| \\{ | { |
|
16
|
+
| \\} | } |
|
17
|
+
| \\_ | _ |
|
18
|
+
| \\textasciitilde{} | ~ |
|
19
|
+
| \\textbackslash{} | \\ |
|
20
|
+
| \\textasciicircum{} | ^ |
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'latex/decode'
|
@@ -0,0 +1,11 @@
|
|
1
|
+
Feature: Decode LaTeX umlauts
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to decode LaTeX umlauts
|
4
|
+
|
5
|
+
Scenario: Lowercase german umlauts
|
6
|
+
When I decode the string '\"a\"o\"u'
|
7
|
+
Then the result should be 'äöü'
|
8
|
+
|
9
|
+
Scenario: Uppercase german umlauts
|
10
|
+
When I decode the string '\"A\"O\"U'
|
11
|
+
Then the result should be 'ÄÖÜ'
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# Make the gem's own lib directory loadable so the version constant can be
# read while the specification is being evaluated.
lib = File.expand_path('../lib/', __FILE__)
$:.unshift lib unless $:.include?(lib)

require 'latex/decode/version'

Gem::Specification.new do |s|
  s.name        = 'latex-decode'
  s.version     = LaTeX::Decode::VERSION.dup
  s.authors     = ['Sylvester Keil']
  # NOTE(review): this value is a URL, not an e-mail address - confirm the
  # intended contact address with the author before changing it.
  s.email       = ['http://sylvester.keil.or.at']
  s.homepage    = 'http://github.com/inukshuk/latex-decode'
  s.summary     = 'Decodes LaTeX to Unicode.'
  s.description = 'Decodes strings formatted in LaTeX to equivalent Unicode strings.'
  s.license     = 'GPL-3'

  # The 'unicode' gem is only declared as a dependency when the gem is not
  # built on a Java platform.
  if RUBY_PLATFORM =~ /java/
    s.platform = 'java'
  else
    s.add_dependency('unicode', '~> 0.4')
    s.platform = 'ruby'
  end

  s.add_development_dependency('rake', '~> 0.8')
  s.add_development_dependency('bundler', '~> 1.0')
  s.add_development_dependency('rdoc', '~> 3.6')
  s.add_development_dependency('rspec', '~> 2.6')
  s.add_development_dependency('cucumber', "~> 1.0")

  s.files        = `git ls-files`.split("\n")
  s.test_files   = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables  = []
  s.require_path = 'lib'

  # Spelled out as an explicit array: a %w{} literal performs no quote
  # processing, so a quoted title inside it keeps its literal double quotes
  # as part of the option value.
  s.rdoc_options = [
    '--line-numbers',
    '--inline-source',
    '--title', 'LaTeX-Decode Documentation',
    '--main', 'README.md',
    '--webcvs=http://github.com/inukshuk/latex-decode/tree/master/'
  ]
  s.extra_rdoc_files = %w{README.md LICENSE}
end
|
38
|
+
|
39
|
+
# vim: syntax=ruby
|
data/lib/latex/decode.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#--
|
2
|
+
# LaTeX::Decode
|
3
|
+
# Copyright (C) 2011 Sylvester Keil <sylvester.keil.or.at>
|
4
|
+
# Copyright (C) 2010 François Charette
|
5
|
+
#
|
6
|
+
# This program is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require 'latex/decode/version'
|
21
|
+
require 'latex/decode/compatibility'
|
22
|
+
require 'latex/decode/base'
|
23
|
+
|
24
|
+
require 'latex/decode/accents'
|
25
|
+
require 'latex/decode/diacritics'
|
26
|
+
require 'latex/decode/punctuation'
|
27
|
+
require 'latex/decode/symbols'
|
28
|
+
|
29
|
+
module LaTeX

  class << self
    # Converts the LaTeX markup in +string+ to Unicode.
    #
    # The input is never modified: all decoders run on a private copy.
    # Non-String arguments are converted with #to_s first; an argument that
    # does not respond to #to_s is returned as is.
    #
    # Returns whatever LaTeX.normalize_C returns for the decoded copy.
    def decode(string)
      return string unless string.respond_to?(:to_s)

      # Always duplicate. The decoders below mutate their argument in place,
      # and #to_s may hand back a frozen String (nil, true and false do so
      # on Ruby 2.7 and later), which would make them raise.
      string = (string.is_a?(String) ? string : string.to_s).dup

      Decode::Base.normalize(string)

      Decode::Accents.decode!(string)
      Decode::Diacritics.decode!(string)
      Decode::Punctuation.decode!(string)
      Decode::Symbols.decode!(string)

      Decode::Base.strip_braces(string)

      LaTeX.normalize_C(string)
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module LaTeX
|
4
|
+
module Decode
|
5
|
+
|
6
|
+
# Decoder for the accent macros listed in @map, written either with a braced
# argument or directly followed by the letter.
class Accents < Decoder
  # Accent macro character => the string that replaces it. Every entry is
  # passed through LaTeX.to_unicode before the table is frozen.
  @map = Hash[*%W{
    ` \u0300
    ' \u0301
    ^ \u0302
    ~ \u0303
    = \u0304
    . \u0307
    '' \u0308
    " \u0308
  }.map { |s| LaTeX.to_unicode(s) }].freeze

  # Regexp alternation of all (escaped) accent characters, shared by both
  # patterns below.
  accents = map.keys.map { |key| Regexp.escape(key) }.join('|')

  # Accent followed by an argument in curly braces.
  braced = ruby_18 { /\\(#{accents})\{([[:alpha:]]+)\}/ou } ||
           ruby_19 { /\\(#{accents})\{(\p{L}\p{M}*)\}/ou }

  # Accent immediately followed by the letter.
  bare = ruby_18 { /\\(#{accents})([[:alpha:]])/ou } ||
         ruby_19 { /\\(#{accents})(\p{L}\p{M}*)/ou }

  @patterns = [braced, bare].freeze
end
|
34
|
+
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module LaTeX
|
4
|
+
module Decode
|
5
|
+
|
6
|
+
# Base class of the table-driven decoders.
#
# A subclass configures itself through two class-level instance variables,
# exposed by the readers below:
#
#   @map      - Hash looked up with the first capture group of a match
#   @patterns - Array of Regexps applied in order by decode!
class Decoder
  class << self
    attr_reader :patterns, :map

    # Registers +base+ in the list of subclasses of the receiver.
    def inherited(base)
      subclasses << base
    end

    # Returns the (lazily created) list of classes registered by #inherited.
    def subclasses
      @subclasses ||= []
    end

    # Returns a decoded copy of +string+; the argument is left untouched.
    def decode(string)
      decode!(string.dup)
    end

    # Decodes +string+ in place and returns it.
    #
    # For every match of every pattern the matched text is replaced by the
    # second capture group, the map entry for the first capture group and
    # the third capture group, in that order; missing parts are skipped.
    def decode!(string)
      # A decoder without patterns has nothing to substitute. Return the
      # input untouched instead of printing the class name and failing.
      return string if patterns.nil?

      patterns.each do |pattern|
        string.gsub!(pattern) do
          match = Regexp.last_match
          [match[2], map[match[1]], match[3]].compact.join
        end
      end

      string
    end
  end
end
|
31
|
+
|
32
|
+
# Pre- and post-processing steps shared by all decoders. Both functions
# modify their argument in place and return it.
module Base

  module_function

  # Rewrites a few LaTeX spellings into the canonical braced form.
  def normalize(string)
    # \i, \j -> dotless i / dotless j
    string.gsub!(/\\(?:i|j)\b/) { |m| m == '\\i' ? 'ı' : 'ȷ' }

    # \foo\ bar -> \foo{} bar
    string.gsub!(/(\\[a-zA-Z]+)\\(\s+)/, '\1{}\2')

    # Aaaa\o, -> Aaaa\o{},
    # The macro may also sit at the very beginning of a line, where there is
    # no preceding character to consume; a preceding '{' still disables the
    # rule.
    string.gsub!(/((?:^|[^{])\\\w)([;,.:%])/, '\1{}\2')

    # \c cb -> \c{cb}
    string.gsub!(/(\\[^\sij&#\$\{\}_~%])\s+([[:alpha:]]+)\b/i, '\1{\2}')

    string
  end

  # Removes grouping braces and unescapes \{ and \}.
  def strip_braces(string)
    # Drop every run of braces that is not preceded by a backslash ...
    string.gsub!(/(^|[^\\])([\{\}]+)/, '\1')
    # ... then turn the escaped braces into plain ones.
    string.gsub!(/\\(\{|\})/, '\1')
    string
  end

end
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
# Version shims. Exactly one of ruby_18 / ruby_19 yields to its block; the
# other one returns false, so `ruby_18 { a } || ruby_19 { b }` selects the
# expression for the running interpreter.
if RUBY_VERSION < "1.9"
  $KCODE = 'U'

  module LaTeX
    # Replaces every "uXXXX" or "\uXXXX" sequence (four hex digits) by the
    # character with that code point.
    def self.to_unicode(string)
      string.gsub(/\\?u([\da-f]{4})/i) { |m| [$1.to_i(16)].pack('U') }
    end
  end

  def ruby_18; yield; end
  def ruby_19; false; end
else

  module LaTeX
    # Returns the string unchanged on this branch.
    def self.to_unicode(string); string; end
  end

  def ruby_18; false; end
  def ruby_19; yield; end
end

# Pick an implementation for LaTeX.normalize_C (Unicode normalization form
# C). Candidates are tried in this order: Java's Normalizer on the Java
# platform, the 'unicode' gem, ActiveSupport, and finally Ruby's own
# String#unicode_normalize.
if RUBY_PLATFORM == 'java'
  require 'java'

  # Use the Java native Unicode normalizer
  module LaTeX
    def self.normalize_C(string)
      java.text.Normalizer.normalize(string, java.text.Normalizer::Form::NFC).to_s
    end
  end

else

  begin
    require 'unicode'

    # Use the Unicode gem
    module LaTeX
      def self.normalize_C(string)
        Unicode::normalize_C(string)
      end
    end
  rescue LoadError
    begin
      require 'active_support/multibyte/chars'

      # Treat an ActiveSupport release without Chars#normalize like a missing
      # library, so that the next candidate gets a chance.
      unless ActiveSupport::Multibyte::Chars.method_defined?(:normalize)
        raise LoadError, 'ActiveSupport::Multibyte::Chars#normalize is not available'
      end

      # Use ActiveSupport's normalizer
      module LaTeX
        def self.normalize_C(string)
          ActiveSupport::Multibyte::Chars.new(string).normalize(:c).to_s
        end
      end
    rescue LoadError
      if String.method_defined?(:unicode_normalize)
        # Use the normalizer of Ruby's standard library
        module LaTeX
          def self.normalize_C(string)
            string.unicode_normalize(:nfc)
          end
        end
      else
        fail "Failed to load unicode normalizer: please gem install unicode (or active_support)"
      end
    end
  end

end
|