oga 1.0.1-java → 1.0.2-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -363,16 +363,17 @@
363
363
  # body of an element is lexed using the `main` machine.
364
364
  #
365
365
 
366
- element_start = '<' ident_char;
367
- element_end = '</' identifier (':' identifier)* '>';
368
-
369
366
  action start_element {
370
367
  fhold;
371
368
  fnext element_name;
372
369
  }
373
370
 
371
+ action start_close_element {
372
+ fnext element_close;
373
+ }
374
+
374
375
  action close_element {
375
- callback_simple(id_on_element_end);
376
+ callback(id_on_element_end, data, encoding, ts, te);
376
377
  }
377
378
 
378
379
  action close_element_fnext_main {
@@ -381,6 +382,9 @@
381
382
  fnext main;
382
383
  }
383
384
 
385
+ element_start = '<' ident_char;
386
+ element_end = '</';
387
+
384
388
  # Machine used for lexing the name/namespace of an element.
385
389
  element_name := |*
386
390
  identifier ':' => {
@@ -393,6 +397,28 @@
393
397
  };
394
398
  *|;
395
399
 
400
+ # Machine used for lexing the closing tag of an element
401
+ element_close := |*
402
+ # Namespace prefixes. These are currently not used, but matching them
403
+ # here allows the rule below to be used for the actual element name.
404
+ identifier ':';
405
+
406
+ identifier => close_element;
407
+
408
+ '>' => {
409
+ if ( lines > 0 )
410
+ {
411
+ advance_line(lines);
412
+
413
+ lines = 0;
414
+ }
415
+
416
+ fnext main;
417
+ };
418
+
419
+ any $count_newlines;
420
+ *|;
421
+
396
422
  # Characters that can be used for unquoted HTML attribute values.
397
423
  # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
398
424
  # for more info.
@@ -582,7 +608,7 @@
582
608
  cdata_start => start_cdata;
583
609
  proc_ins_start => start_proc_ins;
584
610
  element_start => start_element;
585
- element_end => close_element;
611
+ element_end => start_close_element;
586
612
  any => start_text;
587
613
  *|;
588
614
  }%%
data/lib/liboga.jar CHANGED
Binary file
data/lib/oga.rb CHANGED
@@ -3,19 +3,19 @@ require 'set'
3
3
  require 'stringio'
4
4
  require 'thread'
5
5
 
6
- require_relative 'oga/version'
7
- require_relative 'oga/oga'
8
- require_relative 'oga/lru'
9
- require_relative 'oga/entity_decoder'
10
- require_relative 'oga/blacklist'
11
- require_relative 'oga/whitelist'
6
+ require 'oga/version'
7
+ require 'oga/oga'
8
+ require 'oga/lru'
9
+ require 'oga/entity_decoder'
10
+ require 'oga/blacklist'
11
+ require 'oga/whitelist'
12
12
 
13
13
  # Load these first so that the native extensions don't have to define the
14
14
  # Oga::XML namespace.
15
- require_relative 'oga/xml/lexer'
16
- require_relative 'oga/xml/parser'
15
+ require 'oga/xml/lexer'
16
+ require 'oga/xml/parser'
17
17
 
18
- require_relative 'liboga'
18
+ require 'liboga'
19
19
 
20
20
  #:nocov:
21
21
  if RUBY_PLATFORM == 'java'
@@ -23,35 +23,35 @@ if RUBY_PLATFORM == 'java'
23
23
  end
24
24
  #:nocov:
25
25
 
26
- require_relative 'oga/xml/html_void_elements'
27
- require_relative 'oga/xml/entities'
28
- require_relative 'oga/xml/querying'
29
- require_relative 'oga/xml/traversal'
30
- require_relative 'oga/xml/node'
31
- require_relative 'oga/xml/document'
32
- require_relative 'oga/xml/character_node'
33
- require_relative 'oga/xml/text'
34
- require_relative 'oga/xml/comment'
35
- require_relative 'oga/xml/cdata'
36
- require_relative 'oga/xml/xml_declaration'
37
- require_relative 'oga/xml/processing_instruction'
38
- require_relative 'oga/xml/doctype'
39
- require_relative 'oga/xml/namespace'
40
- require_relative 'oga/xml/default_namespace'
41
- require_relative 'oga/xml/attribute'
42
- require_relative 'oga/xml/element'
43
- require_relative 'oga/xml/node_set'
44
-
45
- require_relative 'oga/xml/sax_parser'
46
- require_relative 'oga/xml/pull_parser'
47
-
48
- require_relative 'oga/html/parser'
49
- require_relative 'oga/html/sax_parser'
50
- require_relative 'oga/html/entities'
51
-
52
- require_relative 'oga/xpath/lexer'
53
- require_relative 'oga/xpath/parser'
54
- require_relative 'oga/xpath/evaluator'
55
-
56
- require_relative 'oga/css/lexer'
57
- require_relative 'oga/css/parser'
26
+ require 'oga/xml/html_void_elements'
27
+ require 'oga/xml/entities'
28
+ require 'oga/xml/querying'
29
+ require 'oga/xml/traversal'
30
+ require 'oga/xml/node'
31
+ require 'oga/xml/document'
32
+ require 'oga/xml/character_node'
33
+ require 'oga/xml/text'
34
+ require 'oga/xml/comment'
35
+ require 'oga/xml/cdata'
36
+ require 'oga/xml/xml_declaration'
37
+ require 'oga/xml/processing_instruction'
38
+ require 'oga/xml/doctype'
39
+ require 'oga/xml/namespace'
40
+ require 'oga/xml/default_namespace'
41
+ require 'oga/xml/attribute'
42
+ require 'oga/xml/element'
43
+ require 'oga/xml/node_set'
44
+
45
+ require 'oga/xml/sax_parser'
46
+ require 'oga/xml/pull_parser'
47
+
48
+ require 'oga/html/parser'
49
+ require 'oga/html/sax_parser'
50
+ require 'oga/html/entities'
51
+
52
+ require 'oga/xpath/lexer'
53
+ require 'oga/xpath/parser'
54
+ require 'oga/xpath/evaluator'
55
+
56
+ require 'oga/css/lexer'
57
+ require 'oga/css/parser'
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '1.0.1'
2
+ VERSION = '1.0.2'
3
3
  end # Oga
data/lib/oga/xml/lexer.rb CHANGED
@@ -50,6 +50,10 @@ module Oga
50
50
  %w{thead tbody tfoot tr caption colgroup col}
51
51
  )
52
52
 
53
+ HTML_SCRIPT_ELEMENTS = Whitelist.new(%w{script template})
54
+
55
+ HTML_TABLE_ROW_ELEMENTS = Whitelist.new(%w{tr}) + HTML_SCRIPT_ELEMENTS
56
+
53
57
  # Elements that should be closed automatically before a new opening tag is
54
58
  # processed.
55
59
  HTML_CLOSE_SELF = {
@@ -59,8 +63,9 @@ module Oga
59
63
  'dt' => Blacklist.new(%w{dt dd}),
60
64
  'dd' => Blacklist.new(%w{dt dd}),
61
65
  'p' => Blacklist.new(%w{
62
- address article aside blockquote div dl fieldset footer form h1 h2 h3
63
- h4 h5 h6 header hgroup hr main nav ol p pre section table ul
66
+ address article aside blockquote details div dl fieldset figcaption
67
+ figure footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav
68
+ ol p pre section table ul
64
69
  }),
65
70
  'rb' => Blacklist.new(%w{rb rt rtc rp}),
66
71
  'rt' => Blacklist.new(%w{rb rt rtc rp}),
@@ -70,11 +75,11 @@ module Oga
70
75
  'option' => Blacklist.new(%w{optgroup option}),
71
76
  'colgroup' => Whitelist.new(%w{col template}),
72
77
  'caption' => HTML_TABLE_ALLOWED.to_blacklist,
73
- 'table' => HTML_TABLE_ALLOWED,
74
- 'thead' => Whitelist.new(%w{tr}),
75
- 'tbody' => Whitelist.new(%w{tr}),
76
- 'tfoot' => Whitelist.new(%w{tr}),
77
- 'tr' => Whitelist.new(%w{td th}),
78
+ 'table' => HTML_TABLE_ALLOWED + HTML_SCRIPT_ELEMENTS,
79
+ 'thead' => HTML_TABLE_ROW_ELEMENTS,
80
+ 'tbody' => HTML_TABLE_ROW_ELEMENTS,
81
+ 'tfoot' => HTML_TABLE_ROW_ELEMENTS,
82
+ 'tr' => Whitelist.new(%w{td th}) + HTML_SCRIPT_ELEMENTS,
78
83
  'td' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED,
79
84
  'th' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED
80
85
  }
@@ -475,9 +480,19 @@ module Oga
475
480
  ##
476
481
  # Called on the closing tag of an element.
477
482
  #
478
- def on_element_end
483
+ # @param [String] name The name of the element (minus namespace
484
+ # prefix). This is not set for self-closing tags.
485
+ #
486
+ def on_element_end(name = nil)
479
487
  return if @elements.empty?
480
488
 
489
+ if html? and name and @elements.include?(name)
490
+ while current_element != name
491
+ add_token(:T_ELEM_END)
492
+ @elements.pop
493
+ end
494
+ end
495
+
481
496
  add_token(:T_ELEM_END)
482
497
 
483
498
  @elements.pop
metadata CHANGED
@@ -1,199 +1,199 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oga
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: java
6
6
  authors:
7
7
  - Yorick Peterse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-21 00:00:00.000000000 Z
11
+ date: 2015-06-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: ast
15
- version_requirements: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - '>='
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
14
  requirement: !ruby/object:Gem::Requirement
21
15
  requirements:
22
16
  - - '>='
23
17
  - !ruby/object:Gem::Version
24
18
  version: '0'
19
+ name: ast
25
20
  prerelease: false
26
21
  type: :runtime
27
- - !ruby/object:Gem::Dependency
28
- name: ruby-ll
29
22
  version_requirements: !ruby/object:Gem::Requirement
30
23
  requirements:
31
- - - ~>
24
+ - - '>='
32
25
  - !ruby/object:Gem::Version
33
- version: '2.1'
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
34
28
  requirement: !ruby/object:Gem::Requirement
35
29
  requirements:
36
30
  - - ~>
37
31
  - !ruby/object:Gem::Version
38
32
  version: '2.1'
33
+ name: ruby-ll
39
34
  prerelease: false
40
35
  type: :runtime
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
36
  version_requirements: !ruby/object:Gem::Requirement
44
37
  requirements:
45
- - - '>='
38
+ - - ~>
46
39
  - !ruby/object:Gem::Version
47
- version: '0'
40
+ version: '2.1'
41
+ - !ruby/object:Gem::Dependency
48
42
  requirement: !ruby/object:Gem::Requirement
49
43
  requirements:
50
44
  - - '>='
51
45
  - !ruby/object:Gem::Version
52
46
  version: '0'
47
+ name: rake
53
48
  prerelease: false
54
49
  type: :development
55
- - !ruby/object:Gem::Dependency
56
- name: rspec
57
50
  version_requirements: !ruby/object:Gem::Requirement
58
51
  requirements:
59
- - - ~>
52
+ - - '>='
60
53
  - !ruby/object:Gem::Version
61
- version: '3.0'
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
62
56
  requirement: !ruby/object:Gem::Requirement
63
57
  requirements:
64
58
  - - ~>
65
59
  - !ruby/object:Gem::Version
66
60
  version: '3.0'
61
+ name: rspec
67
62
  prerelease: false
68
63
  type: :development
69
- - !ruby/object:Gem::Dependency
70
- name: yard
71
64
  version_requirements: !ruby/object:Gem::Requirement
72
65
  requirements:
73
- - - '>='
66
+ - - ~>
74
67
  - !ruby/object:Gem::Version
75
- version: '0'
68
+ version: '3.0'
69
+ - !ruby/object:Gem::Dependency
76
70
  requirement: !ruby/object:Gem::Requirement
77
71
  requirements:
78
72
  - - '>='
79
73
  - !ruby/object:Gem::Version
80
74
  version: '0'
75
+ name: yard
81
76
  prerelease: false
82
77
  type: :development
83
- - !ruby/object:Gem::Dependency
84
- name: simplecov
85
78
  version_requirements: !ruby/object:Gem::Requirement
86
79
  requirements:
87
80
  - - '>='
88
81
  - !ruby/object:Gem::Version
89
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
90
84
  requirement: !ruby/object:Gem::Requirement
91
85
  requirements:
92
86
  - - '>='
93
87
  - !ruby/object:Gem::Version
94
88
  version: '0'
89
+ name: simplecov
95
90
  prerelease: false
96
91
  type: :development
97
- - !ruby/object:Gem::Dependency
98
- name: kramdown
99
92
  version_requirements: !ruby/object:Gem::Requirement
100
93
  requirements:
101
94
  - - '>='
102
95
  - !ruby/object:Gem::Version
103
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
104
98
  requirement: !ruby/object:Gem::Requirement
105
99
  requirements:
106
100
  - - '>='
107
101
  - !ruby/object:Gem::Version
108
102
  version: '0'
103
+ name: kramdown
109
104
  prerelease: false
110
105
  type: :development
111
- - !ruby/object:Gem::Dependency
112
- name: benchmark-ips
113
106
  version_requirements: !ruby/object:Gem::Requirement
114
107
  requirements:
115
- - - ~>
108
+ - - '>='
116
109
  - !ruby/object:Gem::Version
117
- version: '2.0'
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
118
112
  requirement: !ruby/object:Gem::Requirement
119
113
  requirements:
120
114
  - - ~>
121
115
  - !ruby/object:Gem::Version
122
116
  version: '2.0'
117
+ name: benchmark-ips
123
118
  prerelease: false
124
119
  type: :development
125
- - !ruby/object:Gem::Dependency
126
- name: rake-compiler
127
120
  version_requirements: !ruby/object:Gem::Requirement
128
121
  requirements:
129
- - - '>='
122
+ - - ~>
130
123
  - !ruby/object:Gem::Version
131
- version: '0'
124
+ version: '2.0'
125
+ - !ruby/object:Gem::Dependency
132
126
  requirement: !ruby/object:Gem::Requirement
133
127
  requirements:
134
128
  - - '>='
135
129
  - !ruby/object:Gem::Version
136
130
  version: '0'
131
+ name: rake-compiler
137
132
  prerelease: false
138
133
  type: :development
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '>='
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
139
  description: Oga is an XML/HTML parser written in Ruby.
140
140
  email: yorickpeterse@gmail.com
141
141
  executables: []
142
142
  extensions: []
143
143
  extra_rdoc_files: []
144
144
  files:
145
- - doc/manually_creating_documents.md
146
145
  - doc/xml_namespaces.md
146
+ - doc/manually_creating_documents.md
147
147
  - doc/css_selectors.md
148
148
  - doc/migrating_from_nokogiri.md
149
149
  - doc/css/common.css
150
150
  - lib/oga.rb
151
- - lib/oga/oga.rb
152
- - lib/oga/version.rb
151
+ - lib/oga/whitelist.rb
153
152
  - lib/oga/blacklist.rb
154
- - lib/oga/lru.rb
153
+ - lib/oga/version.rb
155
154
  - lib/oga/entity_decoder.rb
156
- - lib/oga/whitelist.rb
155
+ - lib/oga/lru.rb
156
+ - lib/oga/oga.rb
157
157
  - lib/oga/css/lexer.rb
158
158
  - lib/oga/css/parser.rb
159
- - lib/oga/html/sax_parser.rb
160
- - lib/oga/html/parser.rb
161
- - lib/oga/html/entities.rb
162
- - lib/oga/xml/lexer.rb
163
159
  - lib/oga/xml/namespace.rb
164
- - lib/oga/xml/processing_instruction.rb
165
- - lib/oga/xml/character_node.rb
166
- - lib/oga/xml/sax_parser.rb
167
- - lib/oga/xml/doctype.rb
168
- - lib/oga/xml/document.rb
169
- - lib/oga/xml/comment.rb
170
- - lib/oga/xml/default_namespace.rb
171
- - lib/oga/xml/text.rb
160
+ - lib/oga/xml/lexer.rb
172
161
  - lib/oga/xml/querying.rb
173
- - lib/oga/xml/attribute.rb
174
- - lib/oga/xml/pull_parser.rb
175
162
  - lib/oga/xml/parser.rb
176
- - lib/oga/xml/entities.rb
177
- - lib/oga/xml/html_void_elements.rb
163
+ - lib/oga/xml/traversal.rb
164
+ - lib/oga/xml/text.rb
178
165
  - lib/oga/xml/node.rb
166
+ - lib/oga/xml/document.rb
167
+ - lib/oga/xml/pull_parser.rb
179
168
  - lib/oga/xml/node_set.rb
169
+ - lib/oga/xml/sax_parser.rb
170
+ - lib/oga/xml/cdata.rb
180
171
  - lib/oga/xml/element.rb
172
+ - lib/oga/xml/character_node.rb
173
+ - lib/oga/xml/doctype.rb
174
+ - lib/oga/xml/html_void_elements.rb
175
+ - lib/oga/xml/entities.rb
176
+ - lib/oga/xml/default_namespace.rb
177
+ - lib/oga/xml/attribute.rb
181
178
  - lib/oga/xml/xml_declaration.rb
182
- - lib/oga/xml/cdata.rb
183
- - lib/oga/xml/traversal.rb
179
+ - lib/oga/xml/processing_instruction.rb
180
+ - lib/oga/xml/comment.rb
181
+ - lib/oga/html/parser.rb
182
+ - lib/oga/html/sax_parser.rb
183
+ - lib/oga/html/entities.rb
184
184
  - lib/oga/xpath/lexer.rb
185
- - lib/oga/xpath/evaluator.rb
186
185
  - lib/oga/xpath/parser.rb
187
- - ext/c/lexer.c
188
- - ext/c/lexer.rl
189
- - ext/c/lexer.h
190
- - ext/c/liboga.c
191
- - ext/c/extconf.rb
192
- - ext/c/liboga.h
186
+ - lib/oga/xpath/evaluator.rb
193
187
  - ext/ragel/base_lexer.rl
194
188
  - ext/java/Liboga.java
195
189
  - ext/java/org/liboga/xml/Lexer.java
196
190
  - ext/java/org/liboga/xml/Lexer.rl
191
+ - ext/c/extconf.rb
192
+ - ext/c/lexer.rl
193
+ - ext/c/lexer.h
194
+ - ext/c/liboga.c
195
+ - ext/c/lexer.c
196
+ - ext/c/liboga.h
197
197
  - README.md
198
198
  - LICENSE
199
199
  - oga.gemspec