rind 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/CHANGELOG.rdoc +5 -0
  2. data/lib/rind/parser.rb +41 -9
  3. metadata +4 -4
data/CHANGELOG.rdoc CHANGED
@@ -1,5 +1,10 @@
1
+ == 0.1.2 - 2010.06.17
2
+ * When parsing HTML, script and style tags will not break because of a "<" in their content.
3
+ * Tag names can now have numbers. Previously heading tags would not parse correctly.
4
+
1
5
  == 0.1.1 - 2010.06.14
2
6
  * Attributes without a value or with a value containing a space broke the parser.
7
+ * Text nodes with newlines were being cleared when they should not have been.
3
8
 
4
9
  == 0.1.0 - 2010.06.12
5
10
  * Initial release.
data/lib/rind/parser.rb CHANGED
@@ -15,18 +15,25 @@ module Rind
15
15
  content = File.read(file_name)
16
16
 
17
17
  # tag types
18
- name = /[a-zA-Z_]/
18
+ name = /[a-zA-Z_0-9]/
19
19
  cdata = /<!\[CDATA\[(.*?)\]\]>/m
20
20
  comment = /<!--(.*?)-->/m
21
21
  doctype = /<!DOCTYPE(.*?)>/m
22
22
  processing_instruction = /<\?(.*?)>/m
23
+ full_tag = /<\s*(script|style)\s*(.*?)>(.*?)<\s*\/\s*\5\s*>/m
23
24
  end_tag = /<\s*\/\s*((?:#{name}+:)?#{name}+)\s*>/m
24
- start_tag = /<\s*((?:#{name}+:)?#{name}+)\s*(.*?)?\/?>/m
25
+ start_tag = /<\s*((?:#{name}+:)?#{name}+)\s*(.*?)\/?\s*>/m
26
+
27
+ if type == 'html'
28
+ scan_regex = /#{cdata}|#{comment}|#{doctype}|#{processing_instruction}|#{full_tag}|#{end_tag}|#{start_tag}/o
29
+ else # xml
30
+ scan_regex = /#{cdata}|#{comment}|#{doctype}|#{processing_instruction}|#{end_tag}|#{start_tag}/o
31
+ end
25
32
 
26
33
  # extract tokens from the file content
27
34
  tokens = Array.new
28
35
  text_start = 0
29
- content.scan(/#{cdata}|#{comment}|#{doctype}|#{processing_instruction}|#{end_tag}|#{start_tag}/o) do |token|
36
+ content.scan(scan_regex) do |token|
30
37
  # remove nil entries from the unmatched tag checks
31
38
  token.compact!
32
39
  # get match object
@@ -41,17 +48,42 @@ module Rind
41
48
  text_start = match.end(0)
42
49
 
43
50
  # create a token for the appropriate tag
44
- if match.begin(1) # cdata
51
+ if match.begin(1)
45
52
  tokens.push([CDATA, token].flatten)
46
- elsif match.begin(2) # comment
53
+ elsif match.begin(2)
47
54
  tokens.push([COMMENT, token].flatten)
48
- elsif match.begin(3) # doctype tag
55
+ elsif match.begin(3)
49
56
  tokens.push([DOCTYPE, token].flatten)
50
- elsif match.begin(4) # processing instruction
57
+ elsif match.begin(4)
51
58
  tokens.push([PRO_INST, token].flatten)
52
- elsif match.begin(5) # end tag
59
+ # from here things vary a little
60
+ #
61
+ # html => full tag = 5, end tag = 8, start tag = 9
62
+ # xml => end tag = 5, start tag = 6
63
+ elsif match.begin(5)
64
+ if type == 'html'
65
+ if token[2].nil?
66
+ attr = nil
67
+ text = token[1]
68
+ else
69
+ attr = token[1]
70
+ text = token[2]
71
+ end
72
+ tokens.push([START_TAG, token[0], attr])
73
+ if text.sub!(/\A\s*#{comment}\s*\z/o, '\1')
74
+ tokens.push([COMMENT, text])
75
+ elsif text !~ /\A\s*\z/
76
+ tokens.push([TEXT, text])
77
+ end
78
+ tokens.push([END_TAG, token[0]])
79
+ else
80
+ tokens.push([END_TAG, token].flatten)
81
+ end
82
+ elsif match.begin(6)
83
+ tokens.push([START_TAG, token].flatten)
84
+ elsif match.begin(8)
53
85
  tokens.push([END_TAG, token].flatten)
54
- elsif match.begin(6) # start tag
86
+ elsif match.begin(9)
55
87
  tokens.push([START_TAG, token].flatten)
56
88
  end
57
89
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rind
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 31
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 1
10
- version: 0.1.1
9
+ - 2
10
+ version: 0.1.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Aaron Lasseigne
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-06-14 00:00:00 -05:00
18
+ date: 2010-06-17 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies: []
21
21