latex-decode 0.0.12-java
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +10 -0
- data/LICENSE +621 -0
- data/README.md +63 -0
- data/features/brackets.feature +11 -0
- data/features/diacritics.feature +40 -0
- data/features/non-latex.feature +15 -0
- data/features/punctuation.feature +46 -0
- data/features/special_characters.feature +20 -0
- data/features/step_definitions/latex.rb +7 -0
- data/features/support/env.rb +1 -0
- data/features/umlauts.feature +11 -0
- data/latex-decode.gemspec +39 -0
- data/lib/latex/decode.rb +49 -0
- data/lib/latex/decode/accents.rb +36 -0
- data/lib/latex/decode/base.rb +60 -0
- data/lib/latex/decode/compatibility.rb +59 -0
- data/lib/latex/decode/diacritics.rb +46 -0
- data/lib/latex/decode/punctuation.rb +57 -0
- data/lib/latex/decode/symbols.rb +223 -0
- data/lib/latex/decode/version.rb +5 -0
- metadata +143 -0
data/README.md
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
LaTeX::Decode
|
2
|
+
=============
|
3
|
+
|
4
|
+
LaTeX::Decode is a Ruby gem to convert LaTeX input to Unicode. Its original
|
5
|
+
use was as an input filter for [BibTeX-Ruby](http://rubygems.org/gems/bibtex-ruby)
|
6
|
+
but it can be used independently to decode LaTeX. Many of the patterns used by
|
7
|
+
this Ruby gem are based on François Charette's equivalent Perl module
|
8
|
+
[LaTeX::Decode](https://github.com/fc7/LaTeX-Decode).
|
9
|
+
|
10
|
+
Quickstart
|
11
|
+
----------
|
12
|
+
|
13
|
+
$ [sudo] gem install latex-decode
|
14
|
+
$ irb
|
15
|
+
>> require 'latex/decode'
|
16
|
+
>> LaTeX.decode "dipl\\^{o}me d'\\'{e}tudes sup\\'erieures"
|
17
|
+
=> "diplôme d'études supérieures"
|
18
|
+
|
19
|
+
Compatibility
|
20
|
+
-------------
|
21
|
+
|
22
|
+
Unicode handling is one of the major differences between Ruby 1.8 and newer
|
23
|
+
versions; nevertheless, LaTeX::Decode tries to support 1.8 as best as possible.
|
24
|
+
|
25
|
+
Issues
|
26
|
+
------
|
27
|
+
|
28
|
+
Please use the tracker of the project's
|
29
|
+
[GitHub repository](https://github.com/inukshuk/latex-decode) to report any
|
30
|
+
issues. When describing intended behaviour, please use the extremely simple
|
31
|
+
syntax of the Cucumber features used by LaTeX::Decode; for instance, you could
|
32
|
+
describe the example above as:
|
33
|
+
|
34
|
+
Feature: Decode LaTeX accents
|
35
|
+
As a hacker who works with LaTeX
|
36
|
+
I want to be able to decode LaTeX accents
|
37
|
+
|
38
|
+
Scenario: A French sentence
|
39
|
+
When I decode the string "dipl\\^{o}me d'\\'{e}tudes sup\\'erieures"
|
40
|
+
Then the result should be "diplôme d'études supérieures"
|
41
|
+
|
42
|
+
Credits
|
43
|
+
-------
|
44
|
+
|
45
|
+
Kudos and thanks to all [contributors](https://github.com/inukshuk/latex-decode/contributors)
|
46
|
+
who have made LaTeX::Decode possible!
|
47
|
+
|
48
|
+
Copyright (C) 2011 [Sylvester Keil](http://sylvester.keil.or.at)
|
49
|
+
|
50
|
+
Copyright (C) 2010 François Charette
|
51
|
+
|
52
|
+
This program is free software: you can redistribute it and/or modify
|
53
|
+
it under the terms of the GNU General Public License as published by
|
54
|
+
the Free Software Foundation, either version 3 of the License, or
|
55
|
+
(at your option) any later version.
|
56
|
+
|
57
|
+
This program is distributed in the hope that it will be useful,
|
58
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
59
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
60
|
+
GNU General Public License for more details.
|
61
|
+
|
62
|
+
You should have received a copy of the GNU General Public License
|
63
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
@@ -0,0 +1,11 @@
|
|
1
|
+
Feature: Strip LaTeX brackets
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to remove brackets around single characters
|
4
|
+
|
5
|
+
Scenario: Single character in curly brackets
|
6
|
+
When I decode the string '{a}'
|
7
|
+
Then the result should be 'a'
|
8
|
+
|
9
|
+
Scenario: German umlauts in curly brackets
|
10
|
+
When I decode the string '{\"A}{\"o}{\"u}'
|
11
|
+
Then the result should be 'Äöü'
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Feature: Decode LaTeX diacritics
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to decode LaTeX diacritics
|
4
|
+
|
5
|
+
Scenario Outline: LaTeX to Unicode transformation
|
6
|
+
When I decode the string '<latex>'
|
7
|
+
Then the result should be '<unicode>'
|
8
|
+
|
9
|
+
Scenarios: Diacritics
|
10
|
+
| latex | unicode | description |
|
11
|
+
| \\\`{o} | ò | grave accent |
|
12
|
+
| \\\'{o} | ó | acute accent |
|
13
|
+
| \\^{o} | ô | circumflex |
|
14
|
+
| \\"{o} | ö | umlaut or dieresis |
|
15
|
+
| \\H{o} | ő | long Hungarian umlaut (double acute) |
|
16
|
+
| \\~{o} | õ | tilde |
|
17
|
+
| \\c{c} | ç | cedilla |
|
18
|
+
| \\c c | ç | |
|
19
|
+
| \\c cb | çb | |
|
20
|
+
| \\c {cb}| \\c cb | |
|
21
|
+
| \\c C | Ç | |
|
22
|
+
| {\\c c} | ç | |
|
23
|
+
| \\k{a} | ą | ogonek |
|
24
|
+
| \\l | ł | l with stroke |
|
25
|
+
| \\L | Ł | l with stroke |
|
26
|
+
| \\={o} | ō | macron accent (a bar over the letter) |
|
27
|
+
| \\b{o} | o̱ | bar under the letter |
|
28
|
+
| \\.{o} | ȯ | dot over the letter |
|
29
|
+
| \\d{u} | ụ | dot under the letter |
|
30
|
+
| \\r{a} | å | ring over the letter |
|
31
|
+
| \\u{o} | ŏ | breve over the letter |
|
32
|
+
| \\v{s} | š | caron/hacek ("v") over the letter |
|
33
|
+
| \\t{oo} | o͡o | "tie" (inverted u) over the two letters |
|
34
|
+
| \\aa | å | ring over the letter a |
|
35
|
+
| \\AA | Å | ring over the letter A |
|
36
|
+
| \\o | ø | slashed o |
|
37
|
+
| \\O | Ø | slashed O |
|
38
|
+
| \\ae | æ | ae |
|
39
|
+
| \\AE | Æ | AE |
|
40
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Feature: Keep non-LaTeX markup as it is
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to preserve non-LaTeX text as it is
|
4
|
+
|
5
|
+
Scenario: Standalone escape characters
|
6
|
+
When I decode the string '\\'
|
7
|
+
Then the result should be '\\'
|
8
|
+
|
9
|
+
Scenario: Common markup in Regular Expressions
|
10
|
+
When I decode the string '.*'
|
11
|
+
Then the result should be '.*'
|
12
|
+
When I decode the string '^x$'
|
13
|
+
Then the result should be '^x$'
|
14
|
+
When I decode the string '\\\\2'
|
15
|
+
Then the result should be '\\\\2'
|
@@ -0,0 +1,46 @@
|
|
1
|
+
Feature: Decode LaTeX punctuation directives
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to decode LaTeX punctuation marks
|
4
|
+
|
5
|
+
Scenario Outline: LaTeX to Unicode transformation
|
6
|
+
When I decode the string '<latex>'
|
7
|
+
Then the result should be '<unicode>'
|
8
|
+
|
9
|
+
Scenarios: Punctuation macros
|
10
|
+
| latex | unicode |
|
11
|
+
| \\textendash | – |
|
12
|
+
| \\textemdash | — |
|
13
|
+
| \\textquoteleft | ‘ |
|
14
|
+
| \\textquoteright | ’ |
|
15
|
+
| \\quotesinglbase | ‚ |
|
16
|
+
| \\textquotedblleft | “ |
|
17
|
+
| \\textquotedblright | ” |
|
18
|
+
| \\quotedblbase | „ |
|
19
|
+
| \\dag | † |
|
20
|
+
| \\ddag | ‡ |
|
21
|
+
| \\textbullet | • |
|
22
|
+
| \\dots | … |
|
23
|
+
| \\textperthousand | ‰ |
|
24
|
+
| \\textpertenthousand | ‱ |
|
25
|
+
| \\guilsinglleft | ‹ |
|
26
|
+
| \\guilsinglright | › |
|
27
|
+
| \\textreferencemark | ※ |
|
28
|
+
| \\textinterrobang | ‽ |
|
29
|
+
| \\textoverline | ‾ |
|
30
|
+
| \\langle | ⟨ |
|
31
|
+
| \\rangle | ⟩ |
|
32
|
+
|
33
|
+
|
34
|
+
Scenarios: Punctuation symbols
|
35
|
+
| latex | unicode | description |
|
36
|
+
| - | - | hyphen |
|
37
|
+
| -- | – | en-dash |
|
38
|
+
| --- | — | em-dash |
|
39
|
+
| \\~{} | ~ | tilde |
|
40
|
+
|
41
|
+
Scenarios: Quotation marks
|
42
|
+
| latex | unicode | description |
|
43
|
+
| `` | “ | left double quotes |
|
44
|
+
| '' | ” | right double quotes |
|
45
|
+
| ` | ‘ | left single quotes |
|
46
|
+
| ' | ’ | right single quotes |
|
@@ -0,0 +1,20 @@
|
|
1
|
+
Feature: Decode LaTeX special characters
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to decode a few special characters which are escaped by LaTeX
|
4
|
+
|
5
|
+
Scenario Outline: LaTeX to Unicode transformation
|
6
|
+
When I decode the string '<latex>'
|
7
|
+
Then the result should be '<unicode>'
|
8
|
+
|
9
|
+
Scenarios: Special characters
|
10
|
+
| latex | unicode |
|
11
|
+
| \\\& | & |
|
12
|
+
| \\# | # |
|
13
|
+
| \\$ | $ |
|
14
|
+
| \\% | % |
|
15
|
+
| \\{ | { |
|
16
|
+
| \\} | } |
|
17
|
+
| \\_ | _ |
|
18
|
+
| \\textasciitilde{} | ~ |
|
19
|
+
| \\textbackslash{} | \\ |
|
20
|
+
| \\textasciicircum{} | ^ |
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'latex/decode'
|
@@ -0,0 +1,11 @@
|
|
1
|
+
Feature: Decode LaTeX umlauts
|
2
|
+
As a hacker who works with LaTeX
|
3
|
+
I want to be able to decode LaTeX umlauts
|
4
|
+
|
5
|
+
Scenario: Lowercase German umlauts
|
6
|
+
When I decode the string '\"a\"o\"u'
|
7
|
+
Then the result should be 'äöü'
|
8
|
+
|
9
|
+
Scenario: Uppercase German umlauts
|
10
|
+
When I decode the string '\"A\"O\"U'
|
11
|
+
Then the result should be 'ÄÖÜ'
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib/', __FILE__)
|
3
|
+
$:.unshift lib unless $:.include?(lib)
|
4
|
+
|
5
|
+
require 'latex/decode/version'
|
6
|
+
|
7
|
+
# Gem specification for latex-decode.
#
# The version is read from LaTeX::Decode::VERSION. On Java platforms the gem
# is built as a 'java' platform gem without the `unicode` runtime dependency;
# on every other platform the `unicode` gem is declared as a dependency.
Gem::Specification.new do |s|
  s.name        = 'latex-decode'
  s.version     = LaTeX::Decode::VERSION.dup
  s.authors     = ['Sylvester Keil']
  # NOTE(review): this value is a URL rather than an e-mail address; confirm
  # the intended contact address before changing it.
  s.email       = ['http://sylvester.keil.or.at']
  s.homepage    = 'http://github.com/inukshuk/latex-decode'
  s.summary     = 'Decodes LaTeX to Unicode.'
  s.description = 'Decodes strings formatted in LaTeX to equivalent Unicode strings.'
  s.license     = 'GPL-3'

  if RUBY_PLATFORM =~ /java/
    s.platform = 'java'
  else
    s.add_dependency('unicode', '~> 0.4')
    s.platform = 'ruby'
  end

  s.add_development_dependency('rake', '~> 0.8')
  s.add_development_dependency('bundler', '~> 1.0')
  s.add_development_dependency('rdoc', '~> 3.6')
  s.add_development_dependency('rspec', '~> 2.6')
  s.add_development_dependency('cucumber', "~> 1.0")

  # The file lists are taken from git, so the gem must be built from a
  # git checkout.
  s.files        = `git ls-files`.split("\n")
  s.test_files   = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables  = []
  s.require_path = 'lib'

  # Inside %w{} quote characters are literal, so the title must not be quoted:
  # the escaped space alone keeps "LaTeX-Decode Documentation" a single word.
  s.rdoc_options     = %w{--line-numbers --inline-source --title LaTeX-Decode\ Documentation --main README.md --webcvs=http://github.com/inukshuk/latex-decode/tree/master/}
  s.extra_rdoc_files = %w{README.md LICENSE}
end
|
38
|
+
|
39
|
+
# vim: syntax=ruby
|
data/lib/latex/decode.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#--
|
2
|
+
# LaTeX::Decode
|
3
|
+
# Copyright (C) 2011 Sylvester Keil <sylvester.keil.or.at>
|
4
|
+
# Copyright (C) 2010 François Charette
|
5
|
+
#
|
6
|
+
# This program is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require 'latex/decode/version'
|
21
|
+
require 'latex/decode/compatibility'
|
22
|
+
require 'latex/decode/base'
|
23
|
+
|
24
|
+
require 'latex/decode/accents'
|
25
|
+
require 'latex/decode/diacritics'
|
26
|
+
require 'latex/decode/punctuation'
|
27
|
+
require 'latex/decode/symbols'
|
28
|
+
|
29
|
+
module LaTeX

  class << self
    # Converts the LaTeX markup in +string+ to Unicode.
    #
    # The argument itself is never modified: decoding happens on a copy.
    # Objects that are not Strings are converted with #to_s first; objects
    # that do not respond to #to_s are returned unchanged.
    #
    # The copy is run through Decode::Base.normalize, the Accents,
    # Diacritics, Punctuation and Symbols decoders (in that order) and
    # Decode::Base.strip_braces; the value returned is the result of
    # LaTeX.normalize_C applied to that copy.
    def decode(string)
      return string unless string.respond_to?(:to_s)

      # Always decode a private, unfrozen copy. #to_s may hand back a frozen
      # string (e.g. nil.to_s or true.to_s on current Rubies), which the
      # in-place gsub! calls of the decoders would refuse to modify.
      string = string.is_a?(String) ? string.dup : string.to_s.dup

      Decode::Base.normalize(string)

      Decode::Accents.decode!(string)
      Decode::Diacritics.decode!(string)
      Decode::Punctuation.decode!(string)
      Decode::Symbols.decode!(string)

      Decode::Base.strip_braces(string)

      LaTeX.normalize_C(string)
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module LaTeX
|
4
|
+
module Decode
|
5
|
+
|
6
|
+
# Decoder for LaTeX accent commands that are written as a backslash followed
# by a punctuation character, e.g. \`{o}, \'e or \"u.
class Accents < Decoder
  # Accent character => combining mark placed after the base letter:
  # grave, acute, circumflex, tilde, macron, dot above and diaeresis
  # (the diaeresis is listed under both '' and ").
  @map = Hash[*%W{
    `  \u0300
    '  \u0301
    ^  \u0302
    ~  \u0303
    =  \u0304
    .  \u0307
    '' \u0308
    "  \u0308
  }.map { |s| LaTeX.to_unicode(s) }].freeze

  # Regexp alternation of all accent characters, shared by every pattern.
  accent = map.keys.map { |k| Regexp.escape(k) }.join('|')

  # \<accent>{<letters>} -- argument wrapped in curly brackets. Each
  # ruby_18 / ruby_19 pair supplies the variant for the running interpreter.
  braced =
    ruby_18 { /\\(#{ accent })\{([[:alpha:]]+)\}/ou } ||
    ruby_19 { /\\(#{ accent })\{(\p{L}\p{M}*)\}/ou }

  # \<accent><letter> -- argument follows the command directly.
  bare =
    ruby_18 { /\\(#{ accent })([[:alpha:]])/ou } ||
    ruby_19 { /\\(#{ accent })(\p{L}\p{M}*)/ou }

  @patterns = [braced, bare].freeze

end
|
34
|
+
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module LaTeX
|
4
|
+
module Decode
|
5
|
+
|
6
|
+
# Base class of the pattern-driven decoders.
#
# A concrete decoder is a subclass that sets two class-level instance
# variables:
#
# @patterns:: list of regular expressions; capture group 1 is the key that is
#             looked up in @map, groups 2 and 3 are optional text that is
#             emitted before and after the mapped value.
# @map::      Hash from captured key to replacement text.
class Decoder
  class << self
    attr_reader :patterns, :map

    # Records every direct subclass in #subclasses.
    def inherited(base)
      super
      subclasses << base
    end

    # Returns the list of classes recorded by #inherited for this class.
    def subclasses
      @subclasses ||= []
    end

    # Returns a decoded copy of +string+; +string+ itself is left untouched.
    def decode(string)
      decode!(string.dup)
    end

    # Decodes +string+ in place by applying every pattern in order and
    # returns +string+.
    #
    # Raises a RuntimeError naming the class when it defines no patterns
    # (e.g. when called on Decoder itself).
    def decode!(string)
      raise "#{name} does not define any decoding patterns" unless patterns

      patterns.each do |pattern|
        # Rebuild each match as <group 2><mapped group 1><group 3>; groups
        # that did not take part in the match are nil and dropped.
        string.gsub!(pattern) { [$2, map[$1], $3].compact.join }
      end

      string
    end
  end
end
|
31
|
+
|
32
|
+
# Pre- and post-processing helpers that run around the individual decoders.
# Both helpers modify the given string in place and return it.
module Base

  module_function

  # Rewrites LaTeX input into the canonical forms the decoders match on.
  def normalize(string)
    # \i and \j become the dotless letters.
    string.gsub!(/\\(?:i|j)\b/) do |command|
      if command == '\\i' then 'ı' else 'ȷ' end
    end

    # Applied strictly in this order.
    rewrites = [
      # \foo\ bar -> \foo{} bar
      [/(\\[a-zA-Z]+)\\(\s+)/, '\1{}\2'],
      # Aaaa\o, -> Aaaa\o{},
      [/([^{]\\\w)([;,.:%])/, '\1{}\2'],
      # \c cb -> \c{cb}
      [/(\\[^\sij&#\$\{\}_~%])\s+([[:alpha:]]+)\b/i, '\1{\2}']
    ]

    rewrites.each do |pattern, replacement|
      string.gsub!(pattern, replacement)
    end

    string
  end

  # Drops every run of curly brackets that is not preceded by a backslash,
  # then turns the escaped forms \{ and \} into plain brackets.
  def strip_braces(string)
    [
      [/(^|[^\\])([\{\}]+)/, '\1'],
      [/\\(\{|\})/, '\1']
    ].each do |pattern, replacement|
      string.gsub!(pattern, replacement)
    end

    string
  end

end
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
# Interpreter-version shims.
#
# LaTeX.to_unicode:: turns textual "\uXXXX" / "uXXXX" sequences into the
#                    corresponding characters on Ruby 1.8; returns its
#                    argument untouched on newer Rubies.
# ruby_18, ruby_19:: yield to the given block on the matching interpreter
#                    generation and return false on the other one, so that
#                    `ruby_18 { a } || ruby_19 { b }` selects one variant.
if RUBY_VERSION >= "1.9"

  module LaTeX
    class << self
      def to_unicode(string)
        string
      end
    end
  end

  def ruby_18
    false
  end

  def ruby_19
    yield
  end

else
  $KCODE = 'U'

  module LaTeX
    class << self
      def to_unicode(string)
        string.gsub(/\\?u([\da-f]{4})/i) { [$1.hex].pack('U') }
      end
    end
  end

  def ruby_18
    yield
  end

  def ruby_19
    false
  end
end
|
22
|
+
|
23
|
+
# Defines LaTeX.normalize_C(string), which returns +string+ in Unicode
# Normalization Form C, using the first normalizer that is available:
#
# 1. java.text.Normalizer on Java platforms,
# 2. the `unicode` gem,
# 3. String#unicode_normalize from the standard library, where present,
# 4. ActiveSupport's multibyte support.
#
# Loading fails with a RuntimeError when none of them can be used.
if RUBY_PLATFORM =~ /java/
  require 'java'

  # Use the Java native Unicode normalizer
  module LaTeX
    def self.normalize_C(string)
      java.text.Normalizer.normalize(string, java.text.Normalizer::Form::NFC).to_s
    end
  end

else

  begin
    require 'unicode'

    # Use the Unicode gem
    module LaTeX
      def self.normalize_C(string)
        Unicode::normalize_C(string)
      end
    end
  rescue LoadError
    if ''.respond_to?(:unicode_normalize)

      # Use the standard library's normalizer; no extra gem is needed.
      module LaTeX
        def self.normalize_C(string)
          string.unicode_normalize(:nfc)
        end
      end

    else

      begin
        require 'active_support/multibyte/chars'

        # Use ActiveSupport's normalizer
        module LaTeX
          def self.normalize_C(string)
            ActiveSupport::Multibyte::Chars.new(string).normalize(:c).to_s
          end
        end
      rescue LoadError
        fail "Failed to load unicode normalizer: please gem install unicode (or active_support)"
      end

    end
  end

end
|