entities 0.2.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/compile.js ADDED
@@ -0,0 +1,64 @@
1
var modes = ["XML", "HTML4", "HTML5"];

// Compile the lookup tables and regular expressions for every mode, in order.
// The map of each mode is handed on to the next iteration, where its entries
// are copied into that mode's map.
modes.reduce(function(inherited, mode, index){
	var entityMap = require("./entities/" + mode.toLowerCase() + ".json");

	if(inherited){
		// entries of the previous mode overwrite same-named entries of this one
		var inheritedNames = Object.keys(inherited);
		for(var k = 0; k < inheritedNames.length; k++){
			entityMap[inheritedNames[k]] = inherited[inheritedNames[k]];
		}
	}

	var inverseMap = getInverse(entityMap);
	var compiledMode = {};

	compiledMode.strict = getStrictReplacer(entityMap);
	if(index === 0){
		//there is no non-strict mode for XML
		compiledMode.normal = null;
	} else {
		compiledMode.normal = getReplacer(entityMap);
	}
	compiledMode.inverse = getInverseReplacer(inverseMap);
	compiledMode.inverseObj = inverseMap;
	compiledMode.obj = entityMap;

	module.exports[mode] = compiledMode;

	return entityMap;
}, null);
25
+
26
/**
 * Comparator for `Array.prototype.sort` that orders values descending.
 *
 * Used to order entity names before they are joined into a regular-expression
 * alternation. Equal values compare as 0 so the comparator stays consistent,
 * as `sort` requires.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 1 if `a` sorts after `b`, -1 if before, 0 if equal.
 */
function sortDesc(a, b){
	if(a === b) return 0;
	return a < b ? 1 : -1;
}
29
+
30
/**
 * Builds the global regular expression for non-strict decoding.
 *
 * The expression matches "&" followed by any key of `obj` (tried in
 * descending sort order) or by a decimal / hexadecimal character reference
 * whose trailing semicolon is optional.
 *
 * @param {Object} obj Map whose keys are entity names.
 * @returns {RegExp}
 */
function getReplacer(obj){
	var names = Object.keys(obj);
	names.sort(sortDesc);

	// also match hex and char codes
	var numeric = "|#[xX][\\da-fA-F]+;?|#\\d+;?";

	return new RegExp("&(?:" + names.join("|") + numeric + ")", "g");
}
39
+
40
/**
 * Builds the global regular expression for strict decoding.
 *
 * Only keys of `obj` that end with a semicolon take part, and every match
 * (named or numeric) must be terminated by a semicolon.
 *
 * @param {Object} obj Map whose keys are entity names.
 * @returns {RegExp}
 */
function getStrictReplacer(obj){
	var terminated = [];

	Object.keys(obj).sort(sortDesc).forEach(function(name){
		var last = name.length - 1;
		// keep semicolon-terminated names only, stored without the semicolon
		if(name.charAt(last) === ";") terminated.push(name.substring(0, last));
	});

	// also match hex and char codes
	var pattern = terminated.join("|") + "|#[xX][\\da-fA-F]+|#\\d+";

	return new RegExp("&(?:" + pattern + ");", "g");
}
51
+
52
/**
 * Inverts an entity map: the result maps each value of `obj` to one of the
 * names that produce it. When several names share a value, the one visited
 * last in `Object.keys` order wins.
 *
 * @param {Object} obj Map from entity name to its replacement string.
 * @returns {Object} Map from replacement string to entity name.
 */
function getInverse(obj){
	var inverse = {};
	var names = Object.keys(obj);

	for(var i = 0; i < names.length; i++){
		var name = names[i];
		var hasSemicolon = name.charAt(name.length - 1) === ";";

		//prefer identifiers with a semicolon: skip a bare name when the
		//semicolon-terminated form maps to the same value
		if(!hasSemicolon && obj[name + ";"] === obj[name]) continue;

		inverse[obj[name]] = name;
	}

	return inverse;
}
61
+
62
/**
 * Builds the global regular expression that finds every key of `inverse`
 * (the strings that have a named entity) inside a text.
 *
 * - Regular-expression metacharacters in a key are escaped one by one.
 *   Prefixing a whole key with a single backslash turns a key that starts
 *   with a letter into an escape sequence (e.g. "fj" became "\fj", i.e. a
 *   form feed followed by "j").
 * - Keys are ordered longest first, so a multi-character key is tried before
 *   a shorter key that is its prefix.
 * - An empty map yields an expression that never matches.
 *
 * @param {Object} inverse Map whose keys are the strings to search for.
 * @returns {RegExp}
 */
function getInverseReplacer(inverse){
	var alternatives = Object.keys(inverse)
		.sort(function(a, b){
			if(a.length !== b.length) return b.length - a.length;
			return a < b ? -1 : (a > b ? 1 : 0);
		})
		.map(function(key){
			return key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
		});

	// "(?!)" can never match; an empty pattern would match everywhere
	return new RegExp(alternatives.length ? alternatives.join("|") : "(?!)", "g");
}
package/index.js CHANGED
@@ -1,77 +1,99 @@
1
- var re_notUTF8 = /[\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]/g,
2
- charCode_func = function(c){ return "&#" + c.charCodeAt(0) + ";";};
1
+ var compiled = require("./compile.js"),
2
+ modes = ["XML", "HTML4", "HTML5"];
3
3
 
4
- var fetch = function(filename, inherits){
5
- var obj = require("./entities/" + filename + ".json");
6
-
7
- if(inherits) for(var name in inherits) obj[name] = inherits[name];
8
-
9
- var re = Object.keys(obj).sort().join("|").replace(/(\w+)\|\1;/g, "$1;?");
10
-
11
- // also match hex and char codes
12
- re += "|#[xX][0-9a-fA-F]+;?|#\\d+;?";
4
+ var levels = modes.map(function(name, i){
5
+ var obj = compiled[name],
6
+ strict = genReplaceFunc(obj.strict, getStrictReplacer(obj.obj)),
7
+ //there is no non-strict mode for XML
8
+ normal = i === 0 ? strict : genReplaceFunc(obj.normal, getReplacer(obj.obj)),
9
+ inverse = getInverse(obj.inverseObj, obj.inverse);
10
+
11
+ exports["decode" + name + "Strict"] = strict;
12
+ exports["decode" + name] = normal;
13
+ exports["encode" + name] = inverse;
13
14
 
14
15
  return {
15
- func: function(name){
16
- if (name.charAt(1) === "#") {
17
- if (name.charAt(2).toLowerCase() === "x") {
18
- return String.fromCharCode(parseInt(name.substr(3), 16));
19
- }
20
- return String.fromCharCode(parseInt(name.substr(2), 10));
21
- }
22
- return obj[name.substr(1)];
23
- },
24
- re: new RegExp("&(?:" + re + ")", "g"),
25
- obj: obj
16
+ strict: strict,
17
+ normal: normal,
18
+ inverse: inverse
26
19
  };
20
+ });
21
+
22
+ var decode = levels.map(function(l){ return l.normal; }),
23
+ decodeStrict = levels.map(function(l){ return l.strict; }),
24
+ encode = levels.map(function(l){ return l.inverse; });
25
+
26
+ exports.decode = function(data, level){
27
+ if(!(level >= 0 && level < 3)) level = 0;
28
+ return decode[level](data);
29
+ };
30
+ exports.decodeStrict = function(data, level){
31
+ if(!(level >= 0 && level < 3)) level = 0;
32
+ return decodeStrict[level](data);
33
+ };
34
+ exports.encode = function(data, level){
35
+ if(!(level >= 0 && level < 3)) level = 0;
36
+ return encode[level](data);
27
37
  };
28
38
 
29
- var getReverse = function(obj){
30
- var reverse = Object.keys(obj).reduce(function(reverse, name){
31
- reverse[obj[name]] = name;
32
- return reverse;
33
- }, {});
34
-
35
- return {
36
- func: function(name){ return "&" + reverse[name]; },
37
- re: new RegExp("\\" + Object.keys(reverse).sort().join("|\\"), "g")
39
+ function getReplacer(obj){
40
+ return function normalReplacer(name){
41
+ if(name.charAt(1) === "#"){
42
+ if(name.charAt(2).toLowerCase() === "x"){
43
+ return codePointToSymbol(parseInt(name.substr(3), 16));
44
+ }
45
+ return codePointToSymbol(parseInt(name.substr(2), 10));
46
+ }
47
+ return obj[name.substr(1)];
38
48
  };
39
- };
49
+ }
40
50
 
41
- var modes = ["XML", "HTML4", "HTML5"];
51
/**
 * Converts the code point of a numeric character reference into a string.
 *
 * - Code points above U+FFFF are emitted as a UTF-16 surrogate pair;
 *   `String.fromCharCode` alone would truncate them to 16 bits.
 * - Surrogate code points (U+D800–U+DFFF) and values above U+10FFFF do not
 *   denote characters and yield U+FFFD, the replacement character.
 *
 * @param {number} entity Code point parsed from the reference.
 * @returns {string}
 */
function codePointToSymbol(entity){
	if((entity >= 0xD800 && entity <= 0xDFFF) || entity > 0x10FFFF){
		return "\uFFFD";
	}

	if(entity > 0xFFFF){
		var offset = entity - 0x10000;
		return String.fromCharCode(
			0xD800 + (offset >>> 10),  // high (lead) surrogate
			0xDC00 + (offset & 0x3FF)  // low (trail) surrogate
		);
	}

	return String.fromCharCode(entity);
}
42
54
 
43
- module.exports = {
44
- decode: function(data, level){
45
- if(!modes[level]) level = 0;
46
- return module.exports["decode" + modes[level]](data);
47
- },
48
- encode: function(data, level){
49
- if(!modes[level]) level = 0;
50
- return module.exports["encode" + modes[level]](data);
51
- }
52
- };
55
/**
 * Creates the replacement callback for strict decoding.
 *
 * The callback receives one matched reference (starting with "&"). Numeric
 * references ("&#…" / "&#x…") are converted through `codePointToSymbol`, the
 * same helper the non-strict replacer uses, so both decoders treat code
 * points identically. Any other match is looked up in `obj` by the text
 * after the "&".
 *
 * @param {Object} obj Map from entity name to its replacement string.
 * @returns {function(string): string}
 */
function getStrictReplacer(obj){
	return function strictReplacer(name){
		if(name.charAt(1) === "#"){
			var isHex = name.charAt(2).toLowerCase() === "x";
			return codePointToSymbol(
				isHex ? parseInt(name.substr(3), 16) : parseInt(name.substr(2), 10)
			);
		}
		return obj[name.substr(1)];
	};
}
66
+
67
// Matches a single UTF-16 code unit outside the ASCII range.
var re_nonASCII = /[^\0-\x7F]/g;
// Matches a high surrogate immediately followed by a low surrogate.
var re_astralSymbols = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;

/**
 * Turns the first UTF-16 code unit of `c` into an upper-case hexadecimal
 * character reference such as "&#xFF;".
 *
 * @param {string} c
 * @returns {string}
 */
function nonUTF8Replacer(c){
	var hex = c.charCodeAt(0).toString(16);
	return "&#x" + hex.toUpperCase() + ";";
}
53
73
 
54
- var tmp;
74
/**
 * Turns a surrogate pair (the first two UTF-16 code units of `c`) into an
 * upper-case hexadecimal character reference for the code point it encodes.
 *
 * @param {string} c
 * @returns {string}
 */
function astralReplacer(c){
	// http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
	var lead = c.charCodeAt(0) - 0xD800;
	var trail = c.charCodeAt(1) - 0xDC00;
	var codePoint = lead * 0x400 + trail + 0x10000;

	return "&#x" + codePoint.toString(16).toUpperCase() + ";";
}
55
81
 
56
- modes.forEach(function(name){
57
- var obj = fetch(name.toLowerCase(), tmp),
58
- regex = obj.re,
59
- func = obj.func;
60
-
61
- tmp = obj.obj;
62
-
63
- module.exports["decode" + name] = function(data){
82
+ function getInverse(inverse, re){
83
+ function func(name){
84
+ return "&" + inverse[name];
85
+ }
86
+
87
+ return function(data){
64
88
  return data
65
- .replace(regex, func);
89
+ .replace(re, func)
90
+ .replace(re_astralSymbols, astralReplacer)
91
+ .replace(re_nonASCII, nonUTF8Replacer);
66
92
  };
67
-
68
- var reverse = getReverse(obj.obj),
69
- reverse_re = reverse.re,
70
- reverse_func = reverse.func;
71
-
72
- module.exports["encode" + name] = function(data){
73
- return data
74
- .replace(reverse_re, reverse_func)
75
- .replace(re_notUTF8, charCode_func);
93
+ }
94
+
95
+ function genReplaceFunc(regex, func){
96
+ return function(data){
97
+ return data.replace(regex, func);
76
98
  };
77
- });
99
+ }
package/package.json CHANGED
@@ -1,22 +1,27 @@
1
1
  {
2
- "name": "entities",
3
- "version": "0.2.2",
4
- "description": "Encode & decode XML/HTML entities with ease",
5
- "author": "Felix Boehm <me@feedic.com>",
6
- "keywords": ["html", "xml", "entity", "encoding"],
7
- "main": "./index.js",
8
- "directories": {
9
- "test": "test"
10
- },
11
- "devDependencies": {
12
- "mocha": "~1.9.0"
13
- },
14
- "scripts": {
15
- "test": "mocha"
16
- },
17
- "repository": {
18
- "type": "git"
19
- , "url": "git://github.com/fb55/node-entities.git"
20
- },
21
- "license": "BSD-like"
2
+ "name": "entities",
3
+ "version": "0.5.0",
4
+ "description": "Encode & decode XML/HTML entities with ease",
5
+ "author": "Felix Boehm <me@feedic.com>",
6
+ "keywords": [
7
+ "html",
8
+ "xml",
9
+ "entity",
10
+ "encoding"
11
+ ],
12
+ "main": "./index.js",
13
+ "directories": {
14
+ "test": "test"
15
+ },
16
+ "devDependencies": {
17
+ "mocha": "~1.9.0"
18
+ },
19
+ "scripts": {
20
+ "test": "mocha"
21
+ },
22
+ "repository": {
23
+ "type": "git",
24
+ "url": "git://github.com/fb55/node-entities.git"
25
+ },
26
+ "license": "BSD-like"
22
27
  }
package/readme.md CHANGED
@@ -6,13 +6,12 @@ En- & decoder for XML/HTML entities.
6
6
  * Focussed on ___speed___
7
7
  * Supports three levels of entities: __XML__, __HTML4__ & __HTML5__
8
8
  * Supports _char code_ entities (eg. `&#x55;`)
9
- * Special optimizations for XML: A more restrictive syntax allows faster parsing
10
9
 
11
10
  ##How to…
12
11
 
13
12
  ###…install `entities`
14
13
 
15
- npm install entities
14
+ npm i entities
16
15
 
17
16
  ###…use `entities`
18
17
 
@@ -26,6 +25,3 @@ require("entities").decode(<str> data[, <int> level]);
26
25
  The `level` argument indicates which level of entities should be decoded (0 = XML, 1 = HTML4 and 2 = HTML5). The default is 0 (read: XML).
27
26
 
28
27
  There are also methods to access the level directly. Just append the name of the level to the action and you're ready to go (e.g. `encodeHTML4(data)`, `decodeXML(data)`).
29
-
30
- ##TODO
31
- * There should be a way to remove tables that aren't used. The HTML5 table is pretty heavy, if it's not needed, it shouldn't be kept in memory.
package/test/test.js CHANGED
@@ -1,65 +1,140 @@
1
- var assert = require('assert');
2
- var entities = require('../');
1
+ var assert = require("assert"),
2
+ path = require("path"),
3
+ entities = require('../');
3
4
 
4
- describe("Encode->decode test", function() {
5
- var testcases = [
6
- { input: "asdf & ÿ ü '",
7
- xml: "asdf &amp; &#255; &#252; &apos;",
8
- html4: "asdf &amp; &yuml &uuml &apos;",
9
- html5: "asdf &amp; &yuml &uuml &apos;" },
10
- { input: '&#38;',
11
- xml: '&amp;#38;',
12
- html4: '&amp;#38;',
13
- html5: '&amp;&num;38&semi;' },
14
- ];
15
- testcases.forEach(function(tc) {
16
- var encodedXML = entities.encodeXML(tc.input);
17
- it('should XML encode '+tc.input, function() {
18
- assert.equal(encodedXML, tc.xml);
19
- });
20
- it('should XML decode '+encodedXML, function() {
21
- assert.equal(entities.decodeXML(encodedXML), tc.input);
22
- });
23
- var encodedHTML4 = entities.encodeHTML4(tc.input);
24
- it('should HTML4 encode '+tc.input, function() {
25
- assert.equal(encodedHTML4, tc.html4);
26
- });
27
- it('should HTML4 decode '+encodedHTML4, function() {
28
- assert.equal(entities.decodeHTML4(encodedHTML4), tc.input);
29
- });
30
- var encodedHTML5 = entities.encodeHTML5(tc.input);
31
- it('should HTML5 encode '+tc.input, function() {
32
- assert.equal(encodedHTML5, tc.html5);
33
- });
34
- it('should HTML5 decode '+encodedHTML5, function() {
35
- assert.equal(entities.decodeHTML5(encodedHTML5), tc.input);
36
- });
37
- });
5
// Round-trip tests: every input is encoded at each level, compared with the
// expected text, and decoded back to the input.
describe("Encode->decode test", function(){
	var testcases = [
		{
			input: "asdf & ÿ ü '",
			xml: "asdf &amp; &#xFF; &#xFC; &apos;",
			html4: "asdf &amp; &yuml; &uuml; &apos;",
			html5: "asdf &amp; &yuml; &uuml; &apos;"
		}, {
			input: "&#38;",
			xml: "&amp;#38;",
			html4: "&amp;#38;",
			html5: "&amp;&num;38&semi;"
		}
	];
	testcases.forEach(function(tc){
		var encodedXML = entities.encodeXML(tc.input);
		it("should XML encode " + tc.input, function(){
			assert.equal(encodedXML, tc.xml);
		});
		it("should default to XML encode " + tc.input, function(){
			assert.equal(entities.encode(tc.input), tc.xml);
		});
		it("should XML decode " + encodedXML, function(){
			assert.equal(entities.decodeXML(encodedXML), tc.input);
		});
		// the two titles below used to say "encode" although they test decoding
		it("should default to XML decode " + encodedXML, function(){
			assert.equal(entities.decode(encodedXML), tc.input);
		});
		it("should default strict to XML decode " + encodedXML, function(){
			assert.equal(entities.decodeStrict(encodedXML), tc.input);
		});

		var encodedHTML4 = entities.encodeHTML4(tc.input);
		it("should HTML4 encode " + tc.input, function(){
			assert.equal(encodedHTML4, tc.html4);
		});
		it("should HTML4 decode " + encodedHTML4, function(){
			assert.equal(entities.decodeHTML4(encodedHTML4), tc.input);
		});

		var encodedHTML5 = entities.encodeHTML5(tc.input);
		it("should HTML5 encode " + tc.input, function(){
			assert.equal(encodedHTML5, tc.html5);
		});
		it("should HTML5 decode " + encodedHTML5, function(){
			assert.equal(entities.decodeHTML5(encodedHTML5), tc.input);
		});
	});
});
54
+
55
// Decoding-only tests: each input must decode to the same output with the
// XML, HTML4 and HTML5 decoders.
describe("Decode test", function(){
	var decoders = ["XML", "HTML4", "HTML5"];
	var testcases = [
		{ input: "&amp;amp;", output: "&amp;" },
		{ input: "&amp;#38;", output: "&#38;" },
		{ input: "&amp;#x26;", output: "&#x26;" },
		{ input: "&amp;#X26;", output: "&#X26;" },
		{ input: "&#38;#38;", output: "&#38;" },
		{ input: "&#x26;#38;", output: "&#38;" },
		{ input: "&#X26;#38;", output: "&#38;" },
		{ input: "&#x3a;", output: ":" },
		{ input: "&#x3A;", output: ":" },
		{ input: "&#X3a;", output: ":" },
		{ input: "&#X3A;", output: ":" }
	];

	testcases.forEach(function(tc){
		decoders.forEach(function(mode){
			it("should " + mode + " decode " + tc.input, function(){
				assert.equal(entities["decode" + mode](tc.input), tc.output);
			});
		});
	});
});
81
+
82
var levels = ["xml", "html4", "html5"];

// Runs every entry of the entity tables through the decoders and encoders.
describe("Documents", function(){
	// load every table up front, in level order
	var docs = [];
	for(var n = 0; n < levels.length; n++){
		docs.push(require(path.join("..", "entities", levels[n])));
	}

	// calls `check` with each level index from `start` up to the last level
	function fromLevel(start, check){
		for(var l = start; l < levels.length; l++){
			check(l);
		}
	}

	docs.forEach(function(doc, i){
		describe("Decode", function(){
			it(levels[i], function(){
				Object.keys(doc).forEach(function(e){
					fromLevel(i, function(l){
						assert.equal(entities.decode("&" + e, l), doc[e]);
					});
				});
			});
		});

		describe("Decode strict", function(){
			it(levels[i], function(){
				Object.keys(doc).forEach(function(e){
					if(e.substr(-1) === ";"){
						fromLevel(i, function(l){
							assert.equal(entities.decodeStrict("&" + e, l), doc[e]);
						});
					} else {
						// names without a semicolon must be left untouched
						assert.equal(entities.decodeStrict("&" + e, i), "&" + e);
					}
				});
			});
		});

		describe("Encode", function(){
			it(levels[i], function(){
				Object.keys(doc).forEach(function(e){
					if(e.substr(-1) !== ";") return;
					fromLevel(i, function(l){
						assert.equal(entities.decode(entities.encode(doc[e], l), l), doc[e]);
					});
				});
			});
		});
	});
});
39
125
 
40
- describe("Decode test", function() {
41
- var testcases = [
42
- { input: '&amp;amp;', output: '&amp;' },
43
- { input: '&amp;#38;', output: '&#38;' },
44
- { input: '&amp;#x26;', output: '&#x26;' },
45
- { input: '&amp;#X26;', output: '&#X26;' },
46
- { input: '&#38;#38;', output: '&#38;' },
47
- { input: '&#x26;#38;', output: '&#38;' },
48
- { input: '&#X26;#38;', output: '&#38;' },
49
- { input: '&#x3a;', output: ':' },
50
- { input: '&#x3A;', output: ':' },
51
- { input: '&#X3a;', output: ':' },
52
- { input: '&#X3A;', output: ':' }
53
- ];
54
- testcases.forEach(function(tc) {
55
- it('should XML decode '+tc.input, function() {
56
- assert.equal(entities.decodeXML(tc.input), tc.output);
57
- });
58
- it('should HTML4 decode '+tc.input, function() {
59
- assert.equal(entities.decodeHTML4(tc.input), tc.output);
60
- });
61
- it('should HTML5 decode '+tc.input, function() {
62
- assert.equal(entities.decodeHTML5(tc.input), tc.output);
63
- });
64
- });
126
// Hexadecimal code point -> the surrogate pair that represents it.
var astral = {
	"1D306": "\uD834\uDF06",
	"1D11E": "\uD834\uDD1E"
};

describe("Astral entities", function(){
	var codePoints = Object.keys(astral);

	// registers the test for one code point
	function registerEncodeTest(hex){
		// the matching decode test is disabled:
		/*it("should decode " + astral[hex], function(){
			assert.equal(entities.decode("&#x" + hex + ";"), astral[hex]);
		});*/
		it("should encode " + astral[hex], function(){
			assert.equal(entities.encode(astral[hex]), "&#x" + hex + ";");
		});
	}

	for(var n = 0; n < codePoints.length; n++){
		registerEncodeTest(codePoints[n]);
	}
});