censive 0.19 → 0.21
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/censive.gemspec +1 -1
- data/diagram/NFA to Regex.pdf +0 -0
- data/diagram/censive@ce9d51d.png +0 -0
- data/diagram/csv-ragel.dot +24 -0
- data/diagram/csv.dot +57 -0
- data/diagram/csv.png +0 -0
- data/diagram/csv.rl +45 -0
- data/diagram/csv.svg +270 -0
- data/diagram/diagram.dot +26 -0
- data/diagram/diagram.rl +50 -0
- data/lib/censive.rb +139 -81
- data/lib/test-censive.rb +12 -0
- data/lib/test-csv.rb +12 -0
- metadata +13 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5dffdaf597e038881e378eb30acb7c44cde08de1f9e40e2180076eaa11356c68
|
4
|
+
data.tar.gz: f9d7f77ac597a5d5a86fc1adcad430802ab20bd306bf5856f1191f57ff22f872
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a0187489ebac8a9011f0f77dc9d52ca821ab080271f3eca6a1a40409b587534a9f4608d1f3b65a0253e587c242d01465e3cd773377f8d00b2fbd1723db4b5650
|
7
|
+
data.tar.gz: 94f2e7a204d8b40e058f41d193add0002d169d5d244e81c6895e465de159c6a953f09e313689891f7d12c05bead3baa41ad6fd525a8e297143758553e39ef1ba
|
data/censive.gemspec
CHANGED
Binary file
|
Binary file
|
@@ -0,0 +1,24 @@
|
|
1
|
+
digraph csv {
|
2
|
+
rankdir=LR;
|
3
|
+
node [ shape = point ];
|
4
|
+
ENTRY;
|
5
|
+
en_2;
|
6
|
+
eof_3;
|
7
|
+
node [ shape = circle, height = 0.2 ];
|
8
|
+
node [ fixedsize = true, height = 0.65, shape = doublecircle ];
|
9
|
+
2;
|
10
|
+
3;
|
11
|
+
node [ shape = circle ];
|
12
|
+
1 -> 1 [ label = "-128..-1, 1..'!', '#'..127" ];
|
13
|
+
1 -> 2 [ label = "'\"' / last2, initts" ];
|
14
|
+
2 -> 2 [ label = "0 / ts, last5, initts" ];
|
15
|
+
2 -> 2 [ label = "'\\n', '\\r' / ts, last4, initts" ];
|
16
|
+
2 -> 1 [ label = "'\"' / ts" ];
|
17
|
+
2 -> 2 [ label = "',' / ts, last3, initts" ];
|
18
|
+
2 -> 3 [ label = "DEF / ts" ];
|
19
|
+
3 -> 2 [ label = "0, '\\n', '\\r', '\"', ',' / next1, initts" ];
|
20
|
+
3 -> 3 [ label = "DEF" ];
|
21
|
+
ENTRY -> 2 [ label = "IN" ];
|
22
|
+
en_2 -> 2 [ label = "csv_scan" ];
|
23
|
+
3 -> eof_3 [ label = "EOF / next1" ];
|
24
|
+
}
|
data/diagram/csv.dot
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
digraph finite_state_machine {
|
2
|
+
rankdir=LR;
|
3
|
+
node [fontname="Helvetica,Arial,sans-serif", shape=circle, style=filled, fillcolor="#dddddd"];
|
4
|
+
edge [fontname="Helvetica,Arial,sans-serif"]
|
5
|
+
|
6
|
+
1 [label="1: StartRow"];
|
7
|
+
2 [label="2: InComment"];
|
8
|
+
3 [label="3: StartColumn", shape=doublecircle, fillcolor="#ffdddd"];
|
9
|
+
4 [label="4: InQuotedColumn"];
|
10
|
+
5 [label="5: InDoubleEscapedQuote"];
|
11
|
+
6 [label="6: InEscapedQuote"];
|
12
|
+
7 [label="7: InColumn"];
|
13
|
+
8 [label="8: EndColumnSeparator"];
|
14
|
+
9 [label="9: EndColumnRow", shape=doublecircle, fillcolor="#ffdddd"];
|
15
|
+
10 [label="10: InRowEnd", shape=doublecircle, fillcolor="#ffdddd"];
|
16
|
+
11 [label="11: CRLF"];
|
17
|
+
12 [label="12: EndRow"];
|
18
|
+
|
19
|
+
1 -> 1 [label="eol / discard"];
|
20
|
+
1 -> 2 [label="comment / discard"];
|
21
|
+
1 -> 3 [label="* / ε"];
|
22
|
+
|
23
|
+
2 -> 1 [label="LF / discard"];
|
24
|
+
2 -> 2 [label="* / discard"];
|
25
|
+
|
26
|
+
3 -> 4 [label="quote & @quoting / discard"];
|
27
|
+
3 -> 7 [label="* / copyout"];
|
28
|
+
3 -> 8 [label="sep / discard"];
|
29
|
+
3 -> 9 [label="eol / ε"]
|
30
|
+
|
31
|
+
4 -> 4 [label="* / copyout"];
|
32
|
+
4 -> 5 [label="quote & @quoting / discard"];
|
33
|
+
4 -> 6 [label="esc & @quoting / discard"];
|
34
|
+
|
35
|
+
5 -> 4 [label="quote & @quoting & @double-quote / copyout"];
|
36
|
+
5 -> 7 [label="* / copyout"];
|
37
|
+
5 -> 8 [label="sep / discard"];
|
38
|
+
5 -> 9 [label="eol / ε"]
|
39
|
+
|
40
|
+
6 -> 4 [label="* / copyout"];
|
41
|
+
|
42
|
+
7 -> 7 [label="* / copyout"];
|
43
|
+
7 -> 8 [label="sep / discard"];
|
44
|
+
7 -> 9 [label="eol / ε"]
|
45
|
+
|
46
|
+
8 -> 3 [label="* / ε"];
|
47
|
+
|
48
|
+
9 -> 10 [label="* / ε"];
|
49
|
+
|
50
|
+
10 -> 11 [label="CR & @isCRLF / discard"];
|
51
|
+
10 -> 12 [label="* / discard"];
|
52
|
+
|
53
|
+
11 -> 1 [label="* / ε"];
|
54
|
+
11 -> 1 [label="LF / discard"];
|
55
|
+
|
56
|
+
12 -> 1 [label="* / ε"];
|
57
|
+
}
|
data/diagram/csv.png
ADDED
Binary file
|
data/diagram/csv.rl
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
%%{
|
2
|
+
machine csv;
|
3
|
+
|
4
|
+
variable p s->p;
|
5
|
+
variable pe s->pe;
|
6
|
+
variable eof s->eof;
|
7
|
+
access s->;
|
8
|
+
|
9
|
+
EOF = 0;
|
10
|
+
EOL = [\r\n];
|
11
|
+
comma = [,];
|
12
|
+
string = [^,"\r\n\0]*;
|
13
|
+
quote = '"' [^"\0]* '"';
|
14
|
+
|
15
|
+
csv_scan := |*
|
16
|
+
|
17
|
+
string => {
|
18
|
+
return_token(TK_String);
|
19
|
+
fbreak;
|
20
|
+
};
|
21
|
+
|
22
|
+
quote => {
|
23
|
+
return_token(TK_Quote);
|
24
|
+
s->data += 1;
|
25
|
+
fbreak;
|
26
|
+
};
|
27
|
+
|
28
|
+
comma => {
|
29
|
+
return_token(TK_Comma);
|
30
|
+
fbreak;
|
31
|
+
};
|
32
|
+
|
33
|
+
EOL => {
|
34
|
+
s->curline += 1;
|
35
|
+
return_token(TK_EOL);
|
36
|
+
fbreak;
|
37
|
+
};
|
38
|
+
|
39
|
+
EOF => {
|
40
|
+
return_token(TK_EOF);
|
41
|
+
fbreak;
|
42
|
+
};
|
43
|
+
|
44
|
+
*|;
|
45
|
+
}%%
|
data/diagram/csv.svg
ADDED
@@ -0,0 +1,270 @@
|
|
1
|
+
<svg width="1063" height="1078" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0.00 0.00 2351.40 964.49">
|
2
|
+
<g id="graph0" class="graph" transform="translate(4.0000488281250455,960.4899951171875) scale(1)">
|
3
|
+
<title>finite_state_machine</title>
|
4
|
+
<polygon fill="white" stroke="transparent" points="-4,4 -4,-960.49 2347.4,-960.49 2347.4,4 -4,4"/>
|
5
|
+
<!-- 1 -->
|
6
|
+
<g id="node1" class="node">
|
7
|
+
<title>1</title>
|
8
|
+
<ellipse fill="#dddddd" stroke="black" cx="57.44" cy="-723.59" rx="57.39" ry="57.39"/>
|
9
|
+
<text text-anchor="middle" x="57.44" y="-719.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">1: StartRow</text>
|
10
|
+
</g>
|
11
|
+
<!-- 1->1 -->
|
12
|
+
<g id="edge1" class="edge">
|
13
|
+
<title>1->1</title>
|
14
|
+
<path fill="none" stroke="black" d="M34.14,-776.12C35.77,-789.23 43.54,-799.03 57.44,-799.03 67.55,-799.03 74.41,-793.86 78.03,-786"/>
|
15
|
+
<polygon fill="black" stroke="black" points="81.47,-786.69 80.75,-776.12 74.72,-784.84 81.47,-786.69"/>
|
16
|
+
<text text-anchor="middle" x="57.44" y="-803.23" font-family="Helvetica,Arial,sans-serif" font-size="14.00">eol / discard</text>
|
17
|
+
</g>
|
18
|
+
<!-- 2 -->
|
19
|
+
<g id="node2" class="node">
|
20
|
+
<title>2</title>
|
21
|
+
<ellipse fill="#dddddd" stroke="black" cx="328.63" cy="-854.59" rx="67.19" ry="67.19"/>
|
22
|
+
<text text-anchor="middle" x="328.63" y="-850.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">2: InComment</text>
|
23
|
+
</g>
|
24
|
+
<!-- 1->2 -->
|
25
|
+
<g id="edge2" class="edge">
|
26
|
+
<title>1->2</title>
|
27
|
+
<path fill="none" stroke="black" d="M84.65,-774.4C96.82,-793.08 113.04,-812.58 132.89,-824.59 168.22,-845.97 213.53,-853.86 251.55,-856.22"/>
|
28
|
+
<polygon fill="black" stroke="black" points="251.39,-859.72 261.56,-856.73 251.75,-852.73 251.39,-859.72"/>
|
29
|
+
<text text-anchor="middle" x="184.6" y="-858.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">comment / discard</text>
|
30
|
+
</g>
|
31
|
+
<!-- 3 -->
|
32
|
+
<g id="node3" class="node">
|
33
|
+
<title>3</title>
|
34
|
+
<ellipse fill="#ffdddd" stroke="black" cx="328.63" cy="-636.59" rx="70.15" ry="70.15"/>
|
35
|
+
<ellipse fill="none" stroke="black" cx="328.63" cy="-636.59" rx="74.14" ry="74.14"/>
|
36
|
+
<text text-anchor="middle" x="328.63" y="-632.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">3: StartColumn</text>
|
37
|
+
</g>
|
38
|
+
<!-- 1->3 -->
|
39
|
+
<g id="edge3" class="edge">
|
40
|
+
<title>1->3</title>
|
41
|
+
<path fill="none" stroke="black" d="M112.27,-706.19C151.07,-693.65 204.07,-676.52 247.79,-662.39"/>
|
42
|
+
<polygon fill="black" stroke="black" points="249.1,-665.65 257.54,-659.24 246.95,-658.99 249.1,-665.65"/>
|
43
|
+
<text text-anchor="middle" x="184.6" y="-701.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / ε</text>
|
44
|
+
</g>
|
45
|
+
<!-- 2->1 -->
|
46
|
+
<g id="edge4" class="edge">
|
47
|
+
<title>2->1</title>
|
48
|
+
<path fill="none" stroke="black" d="M270.63,-820.34C259.46,-814.36 247.69,-808.55 236.3,-803.79 192.13,-785.33 176.26,-793.89 132.89,-773.59 126.2,-770.46 119.43,-766.77 112.85,-762.84"/>
|
49
|
+
<polygon fill="black" stroke="black" points="114.35,-759.65 104.01,-757.36 110.66,-765.6 114.35,-759.65"/>
|
50
|
+
<text text-anchor="middle" x="184.6" y="-807.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">LF / discard</text>
|
51
|
+
</g>
|
52
|
+
<!-- 2->2 -->
|
53
|
+
<g id="edge5" class="edge">
|
54
|
+
<title>2->2</title>
|
55
|
+
<path fill="none" stroke="black" d="M302.42,-916.55C304.98,-929.96 313.72,-939.69 328.63,-939.69 339.69,-939.69 347.35,-934.33 351.62,-926.08"/>
|
56
|
+
<polygon fill="black" stroke="black" points="354.95,-927.14 354.83,-916.55 348.32,-924.91 354.95,-927.14"/>
|
57
|
+
<text text-anchor="middle" x="328.63" y="-943.89" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / discard</text>
|
58
|
+
</g>
|
59
|
+
<!-- 4 -->
|
60
|
+
<g id="node4" class="node">
|
61
|
+
<title>4</title>
|
62
|
+
<ellipse fill="#dddddd" stroke="black" cx="683.49" cy="-434.59" rx="88.61" ry="88.61"/>
|
63
|
+
<text text-anchor="middle" x="683.49" y="-430.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">4: InQuotedColumn</text>
|
64
|
+
</g>
|
65
|
+
<!-- 3->4 -->
|
66
|
+
<g id="edge6" class="edge">
|
67
|
+
<title>3->4</title>
|
68
|
+
<path fill="none" stroke="black" d="M393.53,-600.01C450.62,-567.33 534.7,-519.19 597.46,-483.27"/>
|
69
|
+
<polygon fill="black" stroke="black" points="599.23,-486.29 606.17,-478.28 595.75,-480.21 599.23,-486.29"/>
|
70
|
+
<text text-anchor="middle" x="498.94" y="-585.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">quote & @quoting / discard</text>
|
71
|
+
</g>
|
72
|
+
<!-- 7 -->
|
73
|
+
<g id="node7" class="node">
|
74
|
+
<title>7</title>
|
75
|
+
<ellipse fill="#dddddd" stroke="black" cx="1463.55" cy="-539.59" rx="60.26" ry="60.26"/>
|
76
|
+
<text text-anchor="middle" x="1463.55" y="-535.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">7: InColumn</text>
|
77
|
+
</g>
|
78
|
+
<!-- 3->7 -->
|
79
|
+
<g id="edge7" class="edge">
|
80
|
+
<title>3->7</title>
|
81
|
+
<path fill="none" stroke="black" d="M403.1,-636.36C598.59,-635.31 1128.2,-629.41 1299.77,-596.59 1333.2,-590.19 1369.05,-578.24 1398.57,-566.92"/>
|
82
|
+
<polygon fill="black" stroke="black" points="1399.87,-570.17 1407.92,-563.27 1397.33,-563.64 1399.87,-570.17"/>
|
83
|
+
<text text-anchor="middle" x="922.7" y="-633.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / copyout</text>
|
84
|
+
</g>
|
85
|
+
<!-- 8 -->
|
86
|
+
<g id="node8" class="node">
|
87
|
+
<title>8</title>
|
88
|
+
<ellipse fill="#dddddd" stroke="black" cx="1734.3" cy="-615.59" rx="104.78" ry="104.78"/>
|
89
|
+
<text text-anchor="middle" x="1734.3" y="-611.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">8: EndColumnSeparator</text>
|
90
|
+
</g>
|
91
|
+
<!-- 3->8 -->
|
92
|
+
<g id="edge8" class="edge">
|
93
|
+
<title>3->8</title>
|
94
|
+
<path fill="none" stroke="black" d="M396.5,-667.43C404.63,-670.5 412.9,-673.31 420.95,-675.59 534.03,-707.6 564.96,-714.59 682.49,-714.59 682.49,-714.59 682.49,-714.59 1464.55,-714.59 1523.06,-714.59 1584.57,-693.67 1633.95,-671.07"/>
|
95
|
+
<polygon fill="black" stroke="black" points="1635.44,-674.23 1643.03,-666.83 1632.48,-667.89 1635.44,-674.23"/>
|
96
|
+
<text text-anchor="middle" x="1186.56" y="-718.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">sep / discard</text>
|
97
|
+
</g>
|
98
|
+
<!-- 9 -->
|
99
|
+
<g id="node9" class="node">
|
100
|
+
<title>9</title>
|
101
|
+
<ellipse fill="#ffdddd" stroke="black" cx="1734.3" cy="-358.59" rx="85.77" ry="85.77"/>
|
102
|
+
<ellipse fill="none" stroke="black" cx="1734.3" cy="-358.59" rx="89.77" ry="89.77"/>
|
103
|
+
<text text-anchor="middle" x="1734.3" y="-354.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">9: EndColumnRow</text>
|
104
|
+
</g>
|
105
|
+
<!-- 3->9 -->
|
106
|
+
<g id="edge9" class="edge">
|
107
|
+
<title>3->9</title>
|
108
|
+
<path fill="none" stroke="black" d="M345.32,-563.91C379.9,-425.69 476.52,-138.59 682.49,-138.59 682.49,-138.59 682.49,-138.59 1464.55,-138.59 1554.14,-138.59 1630.26,-212.7 1678.37,-274.89"/>
|
109
|
+
<polygon fill="black" stroke="black" points="1675.84,-277.34 1684.68,-283.19 1681.42,-273.11 1675.84,-277.34"/>
|
110
|
+
<text text-anchor="middle" x="1186.56" y="-142.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">eol / ε</text>
|
111
|
+
</g>
|
112
|
+
<!-- 4->4 -->
|
113
|
+
<g id="edge10" class="edge">
|
114
|
+
<title>4->4</title>
|
115
|
+
<path fill="none" stroke="black" d="M651.28,-517.24C655.68,-531.36 666.42,-541.15 683.49,-541.15 696.56,-541.15 705.91,-535.41 711.55,-526.39"/>
|
116
|
+
<polygon fill="black" stroke="black" points="714.76,-527.8 715.69,-517.24 708.38,-524.91 714.76,-527.8"/>
|
117
|
+
<text text-anchor="middle" x="683.49" y="-545.35" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / copyout</text>
|
118
|
+
</g>
|
119
|
+
<!-- 5 -->
|
120
|
+
<g id="node5" class="node">
|
121
|
+
<title>5</title>
|
122
|
+
<ellipse fill="#dddddd" stroke="black" cx="1186.56" cy="-474.59" rx="113.42" ry="113.42"/>
|
123
|
+
<text text-anchor="middle" x="1186.56" y="-470.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">5: InDoubleEscapedQuote</text>
|
124
|
+
</g>
|
125
|
+
<!-- 4->5 -->
|
126
|
+
<g id="edge11" class="edge">
|
127
|
+
<title>4->5</title>
|
128
|
+
<path fill="none" stroke="black" d="M771.93,-427.87C847.42,-423.74 959.17,-421.61 1055.35,-435.79 1059.74,-436.44 1064.18,-437.2 1068.65,-438.05"/>
|
129
|
+
<polygon fill="black" stroke="black" points="1067.96,-441.48 1078.46,-440.06 1069.36,-434.63 1067.96,-441.48"/>
|
130
|
+
<text text-anchor="middle" x="922.7" y="-439.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">quote & @quoting / discard</text>
|
131
|
+
</g>
|
132
|
+
<!-- 6 -->
|
133
|
+
<g id="node6" class="node">
|
134
|
+
<title>6</title>
|
135
|
+
<ellipse fill="#dddddd" stroke="black" cx="1186.56" cy="-258.59" rx="84.56" ry="84.56"/>
|
136
|
+
<text text-anchor="middle" x="1186.56" y="-254.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">6: InEscapedQuote</text>
|
137
|
+
</g>
|
138
|
+
<!-- 4->6 -->
|
139
|
+
<g id="edge12" class="edge">
|
140
|
+
<title>4->6</title>
|
141
|
+
<path fill="none" stroke="black" d="M741.48,-367.37C756.03,-353.75 772.55,-340.84 790.04,-331.79 885,-282.64 1007.86,-266.1 1091.27,-260.74"/>
|
142
|
+
<polygon fill="black" stroke="black" points="1091.72,-264.22 1101.5,-260.13 1091.3,-257.23 1091.72,-264.22"/>
|
143
|
+
<text text-anchor="middle" x="922.7" y="-335.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">esc & @quoting / discard</text>
|
144
|
+
</g>
|
145
|
+
<!-- 5->4 -->
|
146
|
+
<g id="edge13" class="edge">
|
147
|
+
<title>5->4</title>
|
148
|
+
<path fill="none" stroke="black" d="M1073.18,-471.73C993.87,-468.88 885.2,-463.32 790.04,-452.59 787,-452.25 783.91,-451.87 780.8,-451.48"/>
|
149
|
+
<polygon fill="black" stroke="black" points="781.16,-447.99 770.78,-450.12 780.22,-454.93 781.16,-447.99"/>
|
150
|
+
<text text-anchor="middle" x="922.7" y="-474.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">quote & @quoting & @double-quote / copyout</text>
|
151
|
+
</g>
|
152
|
+
<!-- 5->7 -->
|
153
|
+
<g id="edge14" class="edge">
|
154
|
+
<title>5->7</title>
|
155
|
+
<path fill="none" stroke="black" d="M1297.03,-500.47C1330.14,-508.29 1365.5,-516.65 1395.01,-523.63"/>
|
156
|
+
<polygon fill="black" stroke="black" points="1394.24,-527.04 1404.78,-525.93 1395.85,-520.23 1394.24,-527.04"/>
|
157
|
+
<text text-anchor="middle" x="1351.59" y="-523.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / copyout</text>
|
158
|
+
</g>
|
159
|
+
<!-- 5->8 -->
|
160
|
+
<g id="edge15" class="edge">
|
161
|
+
<title>5->8</title>
|
162
|
+
<path fill="none" stroke="black" d="M1298.72,-458.11C1405.89,-444.39 1559.29,-431.13 1611.66,-458.59 1637.2,-471.99 1659.03,-493.39 1676.86,-516.13"/>
|
163
|
+
<polygon fill="black" stroke="black" points="1674.16,-518.37 1682.99,-524.22 1679.74,-514.14 1674.16,-518.37"/>
|
164
|
+
<text text-anchor="middle" x="1463.55" y="-450.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">sep / discard</text>
|
165
|
+
</g>
|
166
|
+
<!-- 5->9 -->
|
167
|
+
<g id="edge16" class="edge">
|
168
|
+
<title>5->9</title>
|
169
|
+
<path fill="none" stroke="black" d="M1294.02,-438.19C1302.04,-435.58 1310.03,-433.02 1317.77,-430.59 1355.58,-418.73 1364.68,-414.13 1403.42,-405.79 1480.49,-389.21 1568.97,-376.9 1634.88,-369.02"/>
|
170
|
+
<polygon fill="black" stroke="black" points="1635.32,-372.5 1644.84,-367.85 1634.5,-365.54 1635.32,-372.5"/>
|
171
|
+
<text text-anchor="middle" x="1463.55" y="-409.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">eol / ε</text>
|
172
|
+
</g>
|
173
|
+
<!-- 6->4 -->
|
174
|
+
<g id="edge17" class="edge">
|
175
|
+
<title>6->4</title>
|
176
|
+
<path fill="none" stroke="black" d="M1119.98,-311.45C1100.16,-325.3 1077.7,-339.01 1055.35,-348.59 944.65,-396.04 906.9,-374.63 790.04,-403.79 786.19,-404.75 782.27,-405.76 778.32,-406.8"/>
|
177
|
+
<polygon fill="black" stroke="black" points="777.2,-403.48 768.44,-409.44 779.01,-410.24 777.2,-403.48"/>
|
178
|
+
<text text-anchor="middle" x="922.7" y="-407.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / copyout</text>
|
179
|
+
</g>
|
180
|
+
<!-- 7->7 -->
|
181
|
+
<g id="edge18" class="edge">
|
182
|
+
<title>7->7</title>
|
183
|
+
<path fill="none" stroke="black" d="M1443.53,-596.47C1445.49,-608.79 1452.16,-617.72 1463.55,-617.72 1471.56,-617.72 1477.23,-613.3 1480.58,-606.4"/>
|
184
|
+
<polygon fill="black" stroke="black" points="1484.03,-607.06 1483.57,-596.47 1477.33,-605.04 1484.03,-607.06"/>
|
185
|
+
<text text-anchor="middle" x="1463.55" y="-621.92" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / copyout</text>
|
186
|
+
</g>
|
187
|
+
<!-- 7->8 -->
|
188
|
+
<g id="edge19" class="edge">
|
189
|
+
<title>7->8</title>
|
190
|
+
<path fill="none" stroke="black" d="M1521.54,-555.71C1551.26,-564.11 1588.64,-574.68 1623.68,-584.59"/>
|
191
|
+
<polygon fill="black" stroke="black" points="1622.92,-588.01 1633.5,-587.37 1624.83,-581.28 1622.92,-588.01"/>
|
192
|
+
<text text-anchor="middle" x="1576.67" y="-583.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">sep / discard</text>
|
193
|
+
</g>
|
194
|
+
<!-- 7->9 -->
|
195
|
+
<g id="edge20" class="edge">
|
196
|
+
<title>7->9</title>
|
197
|
+
<path fill="none" stroke="black" d="M1513.84,-506.39C1552.07,-480.65 1605.89,-444.4 1650.67,-414.24"/>
|
198
|
+
<polygon fill="black" stroke="black" points="1652.87,-416.98 1659.21,-408.49 1648.96,-411.17 1652.87,-416.98"/>
|
199
|
+
<text text-anchor="middle" x="1576.67" y="-489.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">eol / ε</text>
|
200
|
+
</g>
|
201
|
+
<!-- 8->3 -->
|
202
|
+
<g id="edge21" class="edge">
|
203
|
+
<title>8->3</title>
|
204
|
+
<path fill="none" stroke="black" d="M1632.54,-641.15C1582.39,-651.73 1520.64,-661.59 1464.55,-661.59 682.49,-661.59 682.49,-661.59 682.49,-661.59 566.08,-661.59 536.92,-657.67 420.95,-647.59 418.28,-647.36 415.57,-647.11 412.84,-646.85"/>
|
205
|
+
<polygon fill="black" stroke="black" points="413.15,-643.36 402.85,-645.84 412.45,-650.33 413.15,-643.36"/>
|
206
|
+
<text text-anchor="middle" x="1186.56" y="-665.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / ε</text>
|
207
|
+
</g>
|
208
|
+
<!-- 10 -->
|
209
|
+
<g id="node10" class="node">
|
210
|
+
<title>10</title>
|
211
|
+
<ellipse fill="#ffdddd" stroke="black" cx="1971.46" cy="-292.59" rx="67.76" ry="67.76"/>
|
212
|
+
<ellipse fill="none" stroke="black" cx="1971.46" cy="-292.59" rx="71.77" ry="71.77"/>
|
213
|
+
<text text-anchor="middle" x="1971.46" y="-288.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">10: InRowEnd</text>
|
214
|
+
</g>
|
215
|
+
<!-- 9->10 -->
|
216
|
+
<g id="edge22" class="edge">
|
217
|
+
<title>9->10</title>
|
218
|
+
<path fill="none" stroke="black" d="M1821.19,-334.48C1844.5,-327.94 1869.6,-320.9 1892.44,-314.49"/>
|
219
|
+
<polygon fill="black" stroke="black" points="1893.4,-317.85 1902.09,-311.78 1891.51,-311.11 1893.4,-317.85"/>
|
220
|
+
<text text-anchor="middle" x="1869.38" y="-327.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / ε</text>
|
221
|
+
</g>
|
222
|
+
<!-- 11 -->
|
223
|
+
<g id="node11" class="node">
|
224
|
+
<title>11</title>
|
225
|
+
<ellipse fill="#dddddd" stroke="black" cx="2283.81" cy="-292.59" rx="49.89" ry="49.89"/>
|
226
|
+
<text text-anchor="middle" x="2283.81" y="-288.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">11: CRLF</text>
|
227
|
+
</g>
|
228
|
+
<!-- 10->11 -->
|
229
|
+
<g id="edge23" class="edge">
|
230
|
+
<title>10->11</title>
|
231
|
+
<path fill="none" stroke="black" d="M2043.27,-292.59C2097.67,-292.59 2171.93,-292.59 2223.29,-292.59"/>
|
232
|
+
<polygon fill="black" stroke="black" points="2223.56,-296.09 2233.56,-292.59 2223.56,-289.09 2223.56,-296.09"/>
|
233
|
+
<text text-anchor="middle" x="2133.66" y="-296.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">CR & @isCRLF / discard</text>
|
234
|
+
</g>
|
235
|
+
<!-- 12 -->
|
236
|
+
<g id="node12" class="node">
|
237
|
+
<title>12</title>
|
238
|
+
<ellipse fill="#dddddd" stroke="black" cx="2283.81" cy="-59.59" rx="59.68" ry="59.68"/>
|
239
|
+
<text text-anchor="middle" x="2283.81" y="-55.39" font-family="Helvetica,Arial,sans-serif" font-size="14.00">12: EndRow</text>
|
240
|
+
</g>
|
241
|
+
<!-- 10->12 -->
|
242
|
+
<g id="edge24" class="edge">
|
243
|
+
<title>10->12</title>
|
244
|
+
<path fill="none" stroke="black" d="M2029.38,-249.85C2085.76,-207.53 2171.52,-143.14 2227.38,-101.21"/>
|
245
|
+
<polygon fill="black" stroke="black" points="2229.5,-103.99 2235.39,-95.19 2225.29,-98.4 2229.5,-103.99"/>
|
246
|
+
<text text-anchor="middle" x="2133.66" y="-227.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / discard</text>
|
247
|
+
</g>
|
248
|
+
<!-- 11->1 -->
|
249
|
+
<g id="edge25" class="edge">
|
250
|
+
<title>11->1</title>
|
251
|
+
<path fill="none" stroke="black" d="M2274.36,-341.68C2250.6,-461.58 2171.54,-759.59 1972.46,-759.59 327.63,-759.59 327.63,-759.59 327.63,-759.59 257.49,-759.59 177.99,-747.44 123.67,-737.25"/>
|
252
|
+
<polygon fill="black" stroke="black" points="124.31,-733.81 113.83,-735.38 123,-740.69 124.31,-733.81"/>
|
253
|
+
<text text-anchor="middle" x="1351.59" y="-763.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / ε</text>
|
254
|
+
</g>
|
255
|
+
<!-- 11->1 -->
|
256
|
+
<g id="edge26" class="edge">
|
257
|
+
<title>11->1</title>
|
258
|
+
<path fill="none" stroke="black" d="M2274.91,-243.27C2265.69,-202.96 2246.26,-147.99 2206.22,-118.59 2121.97,-56.71 2077,-92.59 1972.46,-92.59 327.63,-92.59 327.63,-92.59 327.63,-92.59 205.79,-92.59 106.77,-495.61 71.91,-657.47"/>
|
259
|
+
<polygon fill="black" stroke="black" points="68.47,-656.8 69.81,-667.32 75.32,-658.27 68.47,-656.8"/>
|
260
|
+
<text text-anchor="middle" x="1351.59" y="-96.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">LF / discard</text>
|
261
|
+
</g>
|
262
|
+
<!-- 12->1 -->
|
263
|
+
<g id="edge27" class="edge">
|
264
|
+
<title>12->1</title>
|
265
|
+
<path fill="none" stroke="black" d="M2224.19,-57.48C2161.79,-55.42 2060.17,-52.59 1972.46,-52.59 327.63,-52.59 327.63,-52.59 327.63,-52.59 227.11,-52.59 186.58,-82.62 132.89,-167.59 82.43,-247.45 65.7,-525.56 60.55,-655.6"/>
|
266
|
+
<polygon fill="black" stroke="black" points="57.04,-655.7 60.16,-665.83 64.04,-655.97 57.04,-655.7"/>
|
267
|
+
<text text-anchor="middle" x="1351.59" y="-56.79" font-family="Helvetica,Arial,sans-serif" font-size="14.00">* / ε</text>
|
268
|
+
</g>
|
269
|
+
</g>
|
270
|
+
</svg>
|
data/diagram/diagram.dot
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
digraph csv {
|
2
|
+
rankdir=LR;
|
3
|
+
node [ shape = point ];
|
4
|
+
ENTRY;
|
5
|
+
en_4;
|
6
|
+
eof_5;
|
7
|
+
node [ shape = circle, height = 0.2 ];
|
8
|
+
node [ fixedsize = true, height = 0.65, shape = doublecircle ];
|
9
|
+
4;
|
10
|
+
5;
|
11
|
+
node [ shape = circle ];
|
12
|
+
1 -> 2 [ label = "'?'" ];
|
13
|
+
2 -> 4 [ label = "'\\n' / last4, initts" ];
|
14
|
+
3 -> 3 [ label = "-128..-1, 1..'!', '#'..127" ];
|
15
|
+
3 -> 4 [ label = "'\"' / last2, initts" ];
|
16
|
+
4 -> 5 [ label = "-128..-1, 1..'\\t', '\\v'..'\\f', 14..'!', '#'..'+', '-'..127 / ts" ];
|
17
|
+
4 -> 4 [ label = "0 / ts, last5, initts" ];
|
18
|
+
4 -> 1 [ label = "'\\r' / ts" ];
|
19
|
+
4 -> 3 [ label = "'\"' / ts" ];
|
20
|
+
4 -> 4 [ label = "',' / ts, last3, initts" ];
|
21
|
+
5 -> 4 [ label = "0, '\\n', '\\r', '\"', ',' / next1, initts" ];
|
22
|
+
5 -> 5 [ label = "DEF" ];
|
23
|
+
ENTRY -> 4 [ label = "IN" ];
|
24
|
+
en_4 -> 4 [ label = "csv_scan" ];
|
25
|
+
5 -> eof_5 [ label = "EOF / next1" ];
|
26
|
+
}
|
data/diagram/diagram.rl
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
%%{
|
2
|
+
machine csv;
|
3
|
+
|
4
|
+
variable p s->p;
|
5
|
+
variable pe s->pe;
|
6
|
+
variable eof s->eof;
|
7
|
+
access s->;
|
8
|
+
|
9
|
+
eol = [\r\n];
|
10
|
+
comment = '#';
|
11
|
+
CR = "\r";
|
12
|
+
LF = "\n";
|
13
|
+
|
14
|
+
EOF = 0;
|
15
|
+
EOL = /\r?\n/;
|
16
|
+
comma = [,];
|
17
|
+
string = [^,"\r\n\0]*;
|
18
|
+
quote = '"' [^"\0]* '"';
|
19
|
+
|
20
|
+
csv_scan := |*
|
21
|
+
|
22
|
+
string => {
|
23
|
+
return_token(TK_String);
|
24
|
+
fbreak;
|
25
|
+
};
|
26
|
+
|
27
|
+
quote => {
|
28
|
+
return_token(TK_Quote);
|
29
|
+
s->data += 1;
|
30
|
+
fbreak;
|
31
|
+
};
|
32
|
+
|
33
|
+
comma => {
|
34
|
+
return_token(TK_Comma);
|
35
|
+
fbreak;
|
36
|
+
};
|
37
|
+
|
38
|
+
EOL => {
|
39
|
+
s->curline += 1;
|
40
|
+
return_token(TK_EOL);
|
41
|
+
fbreak;
|
42
|
+
};
|
43
|
+
|
44
|
+
EOF => {
|
45
|
+
return_token(TK_EOF);
|
46
|
+
fbreak;
|
47
|
+
};
|
48
|
+
|
49
|
+
*|;
|
50
|
+
}%%
|
data/lib/censive.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# censive - A quick and lightweight CSV handling library for Ruby
|
5
5
|
#
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
7
|
-
# Date: Feb
|
7
|
+
# Date: Feb 14, 2023
|
8
8
|
#
|
9
9
|
# https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
10
10
|
# https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
@@ -14,14 +14,22 @@
|
|
14
14
|
# GOALS:
|
15
15
|
# 1. Faster than Ruby's default CSV library
|
16
16
|
# 2. Lightweight code with streamlined and optimized logic
|
17
|
-
# 3. Support most non-compliant CSV variations (
|
17
|
+
# 3. Support most non-compliant CSV variations (@excel, @relax, etc)
|
18
|
+
# 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
|
18
19
|
#
|
19
|
-
# TODO:
|
20
|
+
# TODO:
|
21
|
+
# 1. Support IO streaming
|
22
|
+
# 2. Review all encodings, we may be losing speed when mixing encodings
|
23
|
+
# 3. Speedup possible if our @unquoted regex reads beyond @eol's
|
24
|
+
# 4. Will using String#freeze give us a speed up?
|
25
|
+
# 5. Implement support for scan_until(string) <= right now only regex is valid
|
20
26
|
# ============================================================================
|
21
27
|
|
22
28
|
require "strscan"
|
23
29
|
|
24
30
|
class Censive < StringScanner
|
31
|
+
attr :encoding, :out
|
32
|
+
|
25
33
|
def self.parse(...)
|
26
34
|
new(...).parse
|
27
35
|
end
|
@@ -34,78 +42,73 @@ class Censive < StringScanner
|
|
34
42
|
end
|
35
43
|
end
|
36
44
|
|
37
|
-
def initialize(str=
|
38
|
-
drop: false , # drop trailing empty
|
39
|
-
encoding:
|
45
|
+
def initialize(str=nil,
|
46
|
+
drop: false , # drop trailing empty columns?
|
47
|
+
encoding: nil , # character encoding
|
40
48
|
excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
|
41
49
|
mode: :compact, # export mode: compact or full
|
42
|
-
out:
|
50
|
+
out: nil , # output stream, needs to respond to <<
|
43
51
|
quote: '"' , # quote character
|
44
52
|
relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
|
45
53
|
rowsep: "\n" , # row separator for export
|
46
54
|
sep: "," , # column separator character
|
47
|
-
strip: false , # strip
|
48
|
-
**opts
|
55
|
+
strip: false , # strip columns when reading
|
56
|
+
**opts # grab bag
|
49
57
|
)
|
50
|
-
# data source
|
51
|
-
str
|
58
|
+
# initialize data source
|
59
|
+
if str && str.size < 100 && File.readable?(str)
|
60
|
+
str = File.open(str, encoding ? "r:#{encoding}" : "r").read
|
61
|
+
else
|
62
|
+
str ||= ""
|
63
|
+
str = str.encode(encoding) if encoding
|
64
|
+
end
|
52
65
|
super(str)
|
53
66
|
reset
|
54
67
|
|
55
|
-
# options
|
68
|
+
# config options
|
69
|
+
@cheat = true
|
56
70
|
@drop = drop
|
71
|
+
@encoding = str.encoding
|
57
72
|
@excel = excel
|
58
73
|
@mode = mode
|
59
|
-
@out = out
|
60
|
-
@quote = quote
|
74
|
+
@out = out || $stdout
|
61
75
|
@relax = relax
|
76
|
+
@strip = strip
|
77
|
+
|
78
|
+
# config strings
|
79
|
+
@quote = quote
|
62
80
|
@rowsep = rowsep
|
63
81
|
@sep = sep
|
64
|
-
@strip = strip
|
65
82
|
|
66
|
-
#
|
67
|
-
@cr
|
68
|
-
@lf
|
69
|
-
@es
|
70
|
-
@eq
|
71
|
-
|
72
|
-
|
73
|
-
@
|
83
|
+
# static strings
|
84
|
+
@cr = "\r"
|
85
|
+
@lf = "\n"
|
86
|
+
@es = ""
|
87
|
+
@eq = "="
|
88
|
+
|
89
|
+
# combinations
|
90
|
+
@esc = (@quote * 2)
|
91
|
+
@seq = [@sep, @eq].join # used for parsing in excel mode
|
92
|
+
|
93
|
+
# regexes
|
94
|
+
@eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
|
95
|
+
@eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
|
96
|
+
@escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
|
97
|
+
@quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
|
98
|
+
@quotes = /#{@quote}/o
|
99
|
+
@seps = /#{@sep}+/o
|
100
|
+
@quoted = @excel ? /(?:=)?#{@quote}/o : @quote
|
101
|
+
@unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
|
102
|
+
@leadzero = /\A0\d*\z/
|
74
103
|
end
|
75
104
|
|
76
105
|
def reset(str=nil)
|
77
|
-
self.string = str if str
|
78
|
-
super()
|
79
106
|
@rows = nil
|
80
107
|
@cols = @cells = 0
|
81
|
-
end
|
82
|
-
|
83
|
-
# ==[ Lexer ]==
|
84
108
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
if scan(@quote) # consume quoted cell
|
89
|
-
token = ""
|
90
|
-
while true
|
91
|
-
token << (scan_until(/#{@quote}/o) or bomb "unclosed quote")[0..-2]
|
92
|
-
token << @quote and next if scan(@quote)
|
93
|
-
break if scan(@eoc)
|
94
|
-
@relax or bomb "invalid character after quote"
|
95
|
-
token << @quote + (scan_until(/#{@quote}/o) or bomb "bad inline quote")
|
96
|
-
end
|
97
|
-
elsif scan(@sep) then return @es
|
98
|
-
elsif scan(@eol) then return nil
|
99
|
-
else # consume unquoted cell
|
100
|
-
token = scan_until(@eoc) or bomb "unexpected character"
|
101
|
-
token.prepend(@eq) if excel
|
102
|
-
end
|
103
|
-
scan(@sep)
|
104
|
-
@strip ? token.strip : token
|
105
|
-
end
|
106
|
-
|
107
|
-
def bomb(msg)
|
108
|
-
abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
|
109
|
+
self.string = str if str
|
110
|
+
@encoding = string.encoding
|
111
|
+
super()
|
109
112
|
end
|
110
113
|
|
111
114
|
# ==[ Parser ]==
|
@@ -122,18 +125,72 @@ class Censive < StringScanner
|
|
122
125
|
end
|
123
126
|
|
124
127
|
def next_row
|
128
|
+
if @cheat and line = scan_until(@eol)
|
129
|
+
row = line.chomp!.split(@sep, -1)
|
130
|
+
row.each do |col|
|
131
|
+
next if (saw = col.count(@quote)).zero?
|
132
|
+
next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
|
133
|
+
@cheat = false
|
134
|
+
break
|
135
|
+
end if line.include?(@quote)
|
136
|
+
@cheat and return @strip ? row.each(&:strip!) : row
|
137
|
+
unscan
|
138
|
+
end
|
139
|
+
|
125
140
|
token = next_token or return
|
126
|
-
row = [
|
127
|
-
row
|
141
|
+
row = []
|
142
|
+
row.push(*token)
|
143
|
+
row.push(*token) while token = next_token
|
128
144
|
row
|
129
145
|
end
|
130
146
|
|
147
|
+
def next_token
|
148
|
+
if scan(@quoted) # quoted cell
|
149
|
+
token = ""
|
150
|
+
while true
|
151
|
+
token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
|
152
|
+
token << @quote and next if scan(@quote)
|
153
|
+
scan(@eoc) and break
|
154
|
+
@relax or bomb "invalid character after quote"
|
155
|
+
token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
|
156
|
+
end
|
157
|
+
scan(@sep)
|
158
|
+
@strip ? token.strip : token
|
159
|
+
elsif match = scan(@unquoted) # unquoted cell(s)
|
160
|
+
if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
|
161
|
+
unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
|
162
|
+
match << (scan_until(@eoc) or bomb "stray quote")
|
163
|
+
scan(@sep)
|
164
|
+
end
|
165
|
+
end
|
166
|
+
tokens = match.split(@sep, -1)
|
167
|
+
@strip ? tokens.map!(&:strip) : tokens
|
168
|
+
elsif scan(@sep)
|
169
|
+
match = scan(@seps)
|
170
|
+
match ? match.split(@sep, -1) : @es
|
171
|
+
else
|
172
|
+
scan(@eol)
|
173
|
+
nil
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def each
|
178
|
+
@rows ||= parse
|
179
|
+
@rows.each {|row| yield row }
|
180
|
+
end
|
181
|
+
|
182
|
+
def export(**opts)
|
183
|
+
out = opts.empty? ? self : self.class.writer(**opts)
|
184
|
+
each {|row| out << row }
|
185
|
+
out.out
|
186
|
+
end
|
187
|
+
|
131
188
|
# ==[ Helpers ]==
|
132
189
|
|
133
190
|
# returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
134
191
|
def grok(str)
|
135
|
-
if idx = str.index(
|
136
|
-
$1 ? 2 : str.index(
|
192
|
+
if idx = str.index(@escapes)
|
193
|
+
$1 ? 2 : str.index(@quotes, idx) ? 2 : 1
|
137
194
|
else
|
138
195
|
0
|
139
196
|
end
|
@@ -153,11 +210,11 @@ class Censive < StringScanner
|
|
153
210
|
row
|
154
211
|
when 1
|
155
212
|
row.map do |col|
|
156
|
-
col.match?(
|
213
|
+
col.match?(@quotable) ? "#{q}#{col}#{q}" : col
|
157
214
|
end
|
158
215
|
else
|
159
216
|
row.map do |col|
|
160
|
-
@excel && col =~
|
217
|
+
@excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
|
161
218
|
case grok(col)
|
162
219
|
when 0 then col
|
163
220
|
when 1 then "#{q}#{col}#{q}"
|
@@ -168,7 +225,7 @@ class Censive < StringScanner
|
|
168
225
|
when :full
|
169
226
|
if @excel
|
170
227
|
row.map do |col|
|
171
|
-
col =~
|
228
|
+
col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
|
172
229
|
end
|
173
230
|
else
|
174
231
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
@@ -178,16 +235,6 @@ class Censive < StringScanner
|
|
178
235
|
@out << out + @rowsep
|
179
236
|
end
|
180
237
|
|
181
|
-
def each
|
182
|
-
@rows ||= parse
|
183
|
-
@rows.each {|row| yield row }
|
184
|
-
end
|
185
|
-
|
186
|
-
def export(**opts)
|
187
|
-
out = opts.empty? ? self : self.class.writer(**opts)
|
188
|
-
each {|row| out << row }
|
189
|
-
end
|
190
|
-
|
191
238
|
def stats
|
192
239
|
wide = string.size.to_s.size
|
193
240
|
puts "%#{wide}d rows" % @rows.size
|
@@ -195,27 +242,38 @@ class Censive < StringScanner
|
|
195
242
|
puts "%#{wide}d cells" % @cells
|
196
243
|
puts "%#{wide}d bytes" % string.size
|
197
244
|
end
|
245
|
+
|
246
|
+
def bomb(msg)
|
247
|
+
abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
|
248
|
+
end
|
198
249
|
end
|
199
250
|
|
200
251
|
if __FILE__ == $0
|
201
|
-
|
202
|
-
#
|
203
|
-
|
204
|
-
|
252
|
+
str = DATA.gets("\n\n").chomp
|
253
|
+
# str = File.read(ARGV.first || "lc-2023.csv")
|
254
|
+
# str = File.open("KEN_ALL.CSV", "r:cp932").read
|
255
|
+
|
256
|
+
# require "stringio"
|
257
|
+
# csv = Censive.new(str, excel: true, relax: true)
|
258
|
+
# out = "" # StringIO.new
|
259
|
+
# csv.export(out: out) # (excel: true) # sep: "|")
|
260
|
+
# puts out # .string
|
261
|
+
|
262
|
+
puts Censive.new(str, excel: true, relax: true, out: "").export
|
205
263
|
end
|
206
264
|
|
207
265
|
__END__
|
208
|
-
|
209
|
-
|
266
|
+
"Don",="007",10,"Ed"
|
267
|
+
Name,Age,,,Shoe,,,
|
268
|
+
"Alice",27,5
|
210
269
|
Bob,33,10 1/2
|
211
270
|
Charlie or "Chuck",=B2 + B3,9
|
212
|
-
"Doug E Fresh",="007",10
|
213
271
|
Subtotal,=sum(B2:B5),="01234"
|
214
|
-
|
215
|
-
|
272
|
+
A,B,C,D
|
273
|
+
A,B,"C",D
|
274
|
+
A,B,C",D
|
275
|
+
A,B,"C",D
|
216
276
|
123,"CHO, JOELLE "JOJO"",456
|
217
277
|
123,"CHO, JOELLE ""JOJO""",456
|
218
|
-
|
219
|
-
# Excel mode checking
|
220
278
|
=,=x,x=,="x",="","","=",123,0123,="123",="0123"
|
221
|
-
,=x,x=,x,,,,,,=,,123,="0123",123,,="0123"
|
279
|
+
,=x,x=,x,,,,,,=,,123,="0123",123,,="0123"
|
data/lib/test-censive.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "./censive"
|
4
|
+
require "digest/md5"
|
5
|
+
|
6
|
+
path = ARGV[0] || "KEN_ALL.CSV"
|
7
|
+
mode = path =~ /^ken/i ? "r:cp932" : "r"
|
8
|
+
|
9
|
+
data = File.open(path, mode).read
|
10
|
+
rows = Censive.parse(data)
|
11
|
+
|
12
|
+
puts "%s %s (%d size)" % [Digest::MD5.hexdigest(rows.join), path, File.stat(path).size], ""
|
data/lib/test-csv.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "csv"
|
4
|
+
require "digest/md5"
|
5
|
+
|
6
|
+
path = ARGV[0] || "KEN_ALL.CSV"
|
7
|
+
mode = path =~ /^ken/i ? "r:cp932" : "r"
|
8
|
+
|
9
|
+
data = File.open(path, mode).read
|
10
|
+
rows = CSV.parse(data)
|
11
|
+
|
12
|
+
puts "%s %s (%d size)" % [Digest::MD5.hexdigest(rows.join), path, File.stat(path).size], ""
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: censive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.19'
|
4
|
+
version: '0.21'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steve Shreeve
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-02-
|
11
|
+
date: 2023-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A quick and lightweight CSV handling library for Ruby
|
14
14
|
email: steve.shreeve@gmail.com
|
@@ -19,7 +19,18 @@ files:
|
|
19
19
|
- LICENSE
|
20
20
|
- README.md
|
21
21
|
- censive.gemspec
|
22
|
+
- diagram/NFA to Regex.pdf
|
23
|
+
- diagram/censive@ce9d51d.png
|
24
|
+
- diagram/csv-ragel.dot
|
25
|
+
- diagram/csv.dot
|
26
|
+
- diagram/csv.png
|
27
|
+
- diagram/csv.rl
|
28
|
+
- diagram/csv.svg
|
29
|
+
- diagram/diagram.dot
|
30
|
+
- diagram/diagram.rl
|
22
31
|
- lib/censive.rb
|
32
|
+
- lib/test-censive.rb
|
33
|
+
- lib/test-csv.rb
|
23
34
|
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
|
24
35
|
homepage: https://github.com/shreeve/censive
|
25
36
|
licenses:
|