rind 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/CHANGELOG.rdoc +5 -0
  2. data/lib/rind/parser.rb +41 -9
  3. metadata +4 -4
data/CHANGELOG.rdoc CHANGED
@@ -1,5 +1,10 @@
1
+ == 0.1.2 - 2010.06.17
2
+ * When parsing HTML, script and style tags will not break because of a "<" in their content.
3
+ * Tag names can now have numbers. Previously heading tags would not parse correctly.
4
+
1
5
  == 0.1.1 - 2010.06.14
2
6
  * Attributes without a value or with a value containing a space broke the parser.
7
+ * Text nodes with newlines were being cleared when they should not have been.
3
8
 
4
9
  == 0.1.0 - 2010.06.12
5
10
  * Initial release.
data/lib/rind/parser.rb CHANGED
@@ -15,18 +15,25 @@ module Rind
15
15
  content = File.read(file_name)
16
16
 
17
17
  # tag types
18
- name = /[a-zA-Z_]/
18
+ name = /[a-zA-Z_0-9]/
19
19
  cdata = /<!\[CDATA\[(.*?)\]\]>/m
20
20
  comment = /<!--(.*?)-->/m
21
21
  doctype = /<!DOCTYPE(.*?)>/m
22
22
  processing_instruction = /<\?(.*?)>/m
23
+ full_tag = /<\s*(script|style)\s*(.*?)>(.*?)<\s*\/\s*\5\s*>/m
23
24
  end_tag = /<\s*\/\s*((?:#{name}+:)?#{name}+)\s*>/m
24
- start_tag = /<\s*((?:#{name}+:)?#{name}+)\s*(.*?)?\/?>/m
25
+ start_tag = /<\s*((?:#{name}+:)?#{name}+)\s*(.*?)\/?\s*>/m
26
+
27
+ if type == 'html'
28
+ scan_regex = /#{cdata}|#{comment}|#{doctype}|#{processing_instruction}|#{full_tag}|#{end_tag}|#{start_tag}/o
29
+ else # xml
30
+ scan_regex = /#{cdata}|#{comment}|#{doctype}|#{processing_instruction}|#{end_tag}|#{start_tag}/o
31
+ end
25
32
 
26
33
  # extract tokens from the file content
27
34
  tokens = Array.new
28
35
  text_start = 0
29
- content.scan(/#{cdata}|#{comment}|#{doctype}|#{processing_instruction}|#{end_tag}|#{start_tag}/o) do |token|
36
+ content.scan(scan_regex) do |token|
30
37
  # remove nil entries from the unmatched tag checks
31
38
  token.compact!
32
39
  # get match object
@@ -41,17 +48,42 @@ module Rind
41
48
  text_start = match.end(0)
42
49
 
43
50
  # create a token for the appropriate tag
44
- if match.begin(1) # cdata
51
+ if match.begin(1)
45
52
  tokens.push([CDATA, token].flatten)
46
- elsif match.begin(2) # comment
53
+ elsif match.begin(2)
47
54
  tokens.push([COMMENT, token].flatten)
48
- elsif match.begin(3) # doctype tag
55
+ elsif match.begin(3)
49
56
  tokens.push([DOCTYPE, token].flatten)
50
- elsif match.begin(4) # processing instruction
57
+ elsif match.begin(4)
51
58
  tokens.push([PRO_INST, token].flatten)
52
- elsif match.begin(5) # end tag
59
+ # from here things vary a little
60
+ #
61
+ # html => full tag = 5, end tag = 8, start tag = 9
62
+ # xml => end tag = 5, start tag = 6
63
+ elsif match.begin(5)
64
+ if type == 'html'
65
+ if token[2].nil?
66
+ attr = nil
67
+ text = token[1]
68
+ else
69
+ attr = token[1]
70
+ text = token[2]
71
+ end
72
+ tokens.push([START_TAG, token[0], attr])
73
+ if text.sub!(/\A\s*#{comment}\s*\z/o, '\1')
74
+ tokens.push([COMMENT, text])
75
+ elsif text !~ /\A\s*\z/
76
+ tokens.push([TEXT, text])
77
+ end
78
+ tokens.push([END_TAG, token[0]])
79
+ else
80
+ tokens.push([END_TAG, token].flatten)
81
+ end
82
+ elsif match.begin(6)
83
+ tokens.push([START_TAG, token].flatten)
84
+ elsif match.begin(8)
53
85
  tokens.push([END_TAG, token].flatten)
54
- elsif match.begin(6) # start tag
86
+ elsif match.begin(9)
55
87
  tokens.push([START_TAG, token].flatten)
56
88
  end
57
89
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rind
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 31
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 1
10
- version: 0.1.1
9
+ - 2
10
+ version: 0.1.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Aaron Lasseigne
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-06-14 00:00:00 -05:00
18
+ date: 2010-06-17 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies: []
21
21