censive 0.7 → 0.8
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/censive.gemspec +1 -1
 - data/lib/censive.rb +57 -108
 - metadata +1 -1
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 6419efcdc9274ea8bcf7b8527001e33f8bdfea348dfd911686cab36984d507da
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 3b59aead54517fd64d7ece3eaa6f459e301e1e48f1ae34772a7128c61fb739f2
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 7910c09e76a81ed27870ea52fb6c8aea0316ed213c53a026d98adc64f93349477e6acab0a93b88c6f184ce1d317634ecdca9290d50bff9b117b98bedd3ac7b86
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 358ab985947d486b5f486b1f7e9c1f591e3b8e906b9eab59a4ed151e5f5d9652c211f2d2a4ee36f0543227e2ae5e33ba57f1e4c178f6f7e72e05c14d7b46895f
         
     | 
    
        data/censive.gemspec
    CHANGED
    
    
    
        data/lib/censive.rb
    CHANGED
    
    | 
         @@ -5,6 +5,8 @@ 
     | 
|
| 
       5 
5 
     | 
    
         
             
            #
         
     | 
| 
       6 
6 
     | 
    
         
             
            # Author: Steve Shreeve (steve.shreeve@gmail.com)
         
     | 
| 
       7 
7 
     | 
    
         
             
            #   Date: Jan 30, 2023
         
     | 
| 
      
 8 
     | 
    
         
            +
            #
         
     | 
| 
      
 9 
     | 
    
         
            +
            # Thanks: Crystal's CSV library, see https://crystal-lang.org/api/1.7.2/CSV.html
         
     | 
| 
       8 
10 
     | 
    
         
             
            # ==============================================================================
         
     | 
| 
       9 
11 
     | 
    
         
             
            # The goals are:
         
     | 
| 
       10 
12 
     | 
    
         
             
            #
         
     | 
| 
         @@ -15,10 +17,7 @@ 
     | 
|
| 
       15 
17 
     | 
    
         
             
            #
         
     | 
| 
       16 
18 
     | 
    
         
             
            # 1. Option to support IO streaming
         
     | 
| 
       17 
19 
     | 
    
         
             
            # 2. Option to strip whitespace
         
     | 
| 
       18 
     | 
    
         
            -
            # 3.  
     | 
| 
       19 
     | 
    
         
            -
            # 4. Option to force quotes in output
         
     | 
| 
       20 
     | 
    
         
            -
            # 5. Option to allow reading excel CSV (="Text" for cells)
         
     | 
| 
       21 
     | 
    
         
            -
            # 6. Confirm file encodings such as UTF-8, UTF-16, etc.
         
     | 
| 
      
 20 
     | 
    
         
            +
            # 3. Confirm file encodings such as UTF-8, UTF-16, etc.
         
     | 
| 
       22 
21 
     | 
    
         
             
            #
         
     | 
| 
       23 
22 
     | 
    
         
             
            # NOTE: Only getch and scan_until advance strscan's position
         
     | 
| 
       24 
23 
     | 
    
         
             
            # ==============================================================================
         
     | 
| 
         @@ -39,6 +38,7 @@ class Censive < StringScanner 
     | 
|
| 
       39 
38 
     | 
    
         | 
| 
       40 
39 
     | 
    
         
             
                drop:  false   , # enable to drop trailing separators
         
     | 
| 
       41 
40 
     | 
    
         
             
                eol:   "\n"    , # desired line endings for exports
         
     | 
| 
      
 41 
     | 
    
         
            +
                excel: false   , # allow ,="0123" style columns
         
     | 
| 
       42 
42 
     | 
    
         
             
                mode:  :compact, # export mode: compact or full
         
     | 
| 
       43 
43 
     | 
    
         
             
                out:   nil     , # output IO/file
         
     | 
| 
       44 
44 
     | 
    
         
             
                relax: false   , # relax parsing of quotes
         
     | 
| 
         @@ -48,56 +48,61 @@ class Censive < StringScanner 
     | 
|
| 
       48 
48 
     | 
    
         
             
                super(str || '')
         
     | 
| 
       49 
49 
     | 
    
         
             
                reset
         
     | 
| 
       50 
50 
     | 
    
         | 
| 
       51 
     | 
    
         
            -
                @sep 
     | 
| 
       52 
     | 
    
         
            -
                @quote 
     | 
| 
      
 51 
     | 
    
         
            +
                @sep    = sep  .freeze
         
     | 
| 
      
 52 
     | 
    
         
            +
                @quote  = quote.freeze
         
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
                @drop   = drop
         
     | 
| 
      
 55 
     | 
    
         
            +
                @eol    = eol.freeze
         
     | 
| 
      
 56 
     | 
    
         
            +
                @mode   = mode
         
     | 
| 
      
 57 
     | 
    
         
            +
                @out    = out
         
     | 
| 
      
 58 
     | 
    
         
            +
                @relax  = relax
         
     | 
| 
       53 
59 
     | 
    
         | 
| 
       54 
     | 
    
         
            -
                @ 
     | 
| 
       55 
     | 
    
         
            -
                @ 
     | 
| 
       56 
     | 
    
         
            -
                @ 
     | 
| 
       57 
     | 
    
         
            -
                @ 
     | 
| 
       58 
     | 
    
         
            -
                @ 
     | 
| 
      
 60 
     | 
    
         
            +
                @es     = ""   .freeze
         
     | 
| 
      
 61 
     | 
    
         
            +
                @cr     = "\r" .freeze
         
     | 
| 
      
 62 
     | 
    
         
            +
                @lf     = "\n" .freeze
         
     | 
| 
      
 63 
     | 
    
         
            +
                @eq     = "="  .freeze
         
     | 
| 
      
 64 
     | 
    
         
            +
                @esc    = (@quote * 2).freeze
         
     | 
| 
       59 
65 
     | 
    
         | 
| 
       60 
     | 
    
         
            -
                @ 
     | 
| 
       61 
     | 
    
         
            -
                @ 
     | 
| 
       62 
     | 
    
         
            -
                @lf    = "\n" .freeze
         
     | 
| 
       63 
     | 
    
         
            -
                @esc   = (@quote * 2).freeze
         
     | 
| 
      
 66 
     | 
    
         
            +
                @tokens = [@sep,@quote,@cr,@lf,@es,nil]
         
     | 
| 
      
 67 
     | 
    
         
            +
                @tokens << @eq if excel # See http://bit.ly/3Y7jIvc
         
     | 
| 
       64 
68 
     | 
    
         
             
              end
         
     | 
| 
       65 
69 
     | 
    
         | 
| 
       66 
70 
     | 
    
         
             
              def reset(str=nil)
         
     | 
| 
       67 
71 
     | 
    
         
             
                self.string = str if str
         
     | 
| 
       68 
72 
     | 
    
         
             
                super()
         
     | 
| 
       69 
     | 
    
         
            -
                @char 
     | 
| 
       70 
     | 
    
         
            -
                @flag 
     | 
| 
      
 73 
     | 
    
         
            +
                @char = peek(1)
         
     | 
| 
      
 74 
     | 
    
         
            +
                @flag = nil
         
     | 
| 
       71 
75 
     | 
    
         | 
| 
       72 
     | 
    
         
            -
                @rows 
     | 
| 
       73 
     | 
    
         
            -
                @cols 
     | 
| 
      
 76 
     | 
    
         
            +
                @rows = nil
         
     | 
| 
      
 77 
     | 
    
         
            +
                @cols = @cells = 0
         
     | 
| 
       74 
78 
     | 
    
         
             
              end
         
     | 
| 
       75 
79 
     | 
    
         | 
| 
       76 
80 
     | 
    
         
             
              # ==[ Lexer ]==
         
     | 
| 
       77 
81 
     | 
    
         | 
| 
       78 
82 
     | 
    
         
             
              def next_char
         
     | 
| 
       79 
83 
     | 
    
         
             
                getch
         
     | 
| 
       80 
     | 
    
         
            -
                @char =  
     | 
| 
      
 84 
     | 
    
         
            +
                @char = peek(1)
         
     | 
| 
       81 
85 
     | 
    
         
             
              end
         
     | 
| 
       82 
86 
     | 
    
         | 
| 
       83 
87 
     | 
    
         
             
              def next_token
         
     | 
| 
       84 
88 
     | 
    
         
             
                case @flag
         
     | 
| 
       85 
     | 
    
         
            -
                when @es then @flag = nil; [@cr,@lf,nil].include?(@char) and return @es
         
     | 
| 
      
 89 
     | 
    
         
            +
                when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
         
     | 
| 
       86 
90 
     | 
    
         
             
                when @cr then @flag = nil; next_char == @lf and next_char
         
     | 
| 
       87 
91 
     | 
    
         
             
                when @lf then @flag = nil; next_char
         
     | 
| 
       88 
92 
     | 
    
         
             
                end if @flag
         
     | 
| 
       89 
93 
     | 
    
         | 
| 
       90 
     | 
    
         
            -
                if  
     | 
| 
      
 94 
     | 
    
         
            +
                if @tokens.include?(@char)
         
     | 
| 
       91 
95 
     | 
    
         
             
                  case @char
         
     | 
| 
       92 
     | 
    
         
            -
                  when @quote #  
     | 
| 
      
 96 
     | 
    
         
            +
                  when @quote, @eq # consume quoted cell
         
     | 
| 
      
 97 
     | 
    
         
            +
                    @char == @eq and next_char # excel mode: allows ,="012",
         
     | 
| 
       93 
98 
     | 
    
         
             
                    match = ""
         
     | 
| 
       94 
99 
     | 
    
         
             
                    while true
         
     | 
| 
       95 
100 
     | 
    
         
             
                      getch # consume the quote (optimized by not calling next_char)
         
     | 
| 
       96 
101 
     | 
    
         
             
                      match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
         
     | 
| 
       97 
102 
     | 
    
         
             
                      case next_char
         
     | 
| 
       98 
     | 
    
         
            -
                      when @sep 
     | 
| 
       99 
     | 
    
         
            -
                      when @quote 
     | 
| 
       100 
     | 
    
         
            -
                      when @cr,@lf,nil then break
         
     | 
| 
      
 103 
     | 
    
         
            +
                      when @sep            then @flag = @es; next_char; break
         
     | 
| 
      
 104 
     | 
    
         
            +
                      when @quote          then match << @quote
         
     | 
| 
      
 105 
     | 
    
         
            +
                      when @cr,@lf,@es,nil then break
         
     | 
| 
       101 
106 
     | 
    
         
             
                      else
         
     | 
| 
       102 
107 
     | 
    
         
             
                        if @relax
         
     | 
| 
       103 
108 
     | 
    
         
             
                          match << @quote + @char
         
     | 
| 
         @@ -107,14 +112,14 @@ class Censive < StringScanner 
     | 
|
| 
       107 
112 
     | 
    
         
             
                      end
         
     | 
| 
       108 
113 
     | 
    
         
             
                    end
         
     | 
| 
       109 
114 
     | 
    
         
             
                    match
         
     | 
| 
       110 
     | 
    
         
            -
                  when @sep 
     | 
| 
       111 
     | 
    
         
            -
                  when @cr 
     | 
| 
       112 
     | 
    
         
            -
                  when @lf 
     | 
| 
       113 
     | 
    
         
            -
                  when nil 
     | 
| 
      
 115 
     | 
    
         
            +
                  when @sep    then @flag = @es; next_char; @es
         
     | 
| 
      
 116 
     | 
    
         
            +
                  when @cr     then @flag = @cr; nil
         
     | 
| 
      
 117 
     | 
    
         
            +
                  when @lf     then @flag = @lf; nil
         
     | 
| 
      
 118 
     | 
    
         
            +
                  when @es,nil then              nil
         
     | 
| 
       114 
119 
     | 
    
         
             
                  end
         
     | 
| 
       115 
     | 
    
         
            -
                else #  
     | 
| 
      
 120 
     | 
    
         
            +
                else # consume unquoted cell
         
     | 
| 
       116 
121 
     | 
    
         
             
                  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
         
     | 
| 
       117 
     | 
    
         
            -
                  @char =  
     | 
| 
      
 122 
     | 
    
         
            +
                  @char = peek(1)
         
     | 
| 
       118 
123 
     | 
    
         
             
                  @char == @sep and @flag = @es and next_char
         
     | 
| 
       119 
124 
     | 
    
         
             
                  match
         
     | 
| 
       120 
125 
     | 
    
         
             
                end
         
     | 
| 
         @@ -158,6 +163,9 @@ class Censive < StringScanner 
     | 
|
| 
       158 
163 
     | 
    
         
             
              def <<(row)
         
     | 
| 
       159 
164 
     | 
    
         
             
                @out or return super
         
     | 
| 
       160 
165 
     | 
    
         | 
| 
      
 166 
     | 
    
         
            +
                # drop trailing seps, if specified
         
     | 
| 
      
 167 
     | 
    
         
            +
                row.pop while row.last.empty? if @drop
         
     | 
| 
      
 168 
     | 
    
         
            +
             
     | 
| 
       161 
169 
     | 
    
         
             
                # most compact export format
         
     | 
| 
       162 
170 
     | 
    
         
             
                s,q = @sep, @quote
         
     | 
| 
       163 
171 
     | 
    
         
             
                out = case @mode
         
     | 
| 
         @@ -178,9 +186,6 @@ class Censive < StringScanner 
     | 
|
| 
       178 
186 
     | 
    
         
             
                  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
         
     | 
| 
       179 
187 
     | 
    
         
             
                end.join(s)
         
     | 
| 
       180 
188 
     | 
    
         | 
| 
       181 
     | 
    
         
            -
                # drop trailing seps, if specified
         
     | 
| 
       182 
     | 
    
         
            -
                out.gsub!(/#{s}+\z/,'') if @drop
         
     | 
| 
       183 
     | 
    
         
            -
             
     | 
| 
       184 
189 
     | 
    
         
             
                # write output, using desired line endings
         
     | 
| 
       185 
190 
     | 
    
         
             
                @out << out + @eol
         
     | 
| 
       186 
191 
     | 
    
         
             
              end
         
     | 
| 
         @@ -203,81 +208,25 @@ __END__ 
     | 
|
| 
       203 
208 
     | 
    
         | 
| 
       204 
209 
     | 
    
         
             
            # ==[ Playground... ]==
         
     | 
| 
       205 
210 
     | 
    
         | 
| 
       206 
     | 
    
         
            -
            STDOUT.sync = true
         
     | 
| 
       207 
     | 
    
         
            -
             
     | 
| 
       208 
     | 
    
         
            -
             
     | 
| 
       209 
     | 
    
         
            -
             
     | 
| 
       210 
     | 
    
         
            -
             
     | 
| 
       211 
     | 
    
         
            -
             
     | 
| 
       212 
     | 
    
         
            -
             
     | 
| 
       213 
     | 
    
         
            -
             
     | 
| 
       214 
     | 
    
         
            -
             
     | 
| 
       215 
     | 
    
         
            -
             
     | 
| 
       216 
     | 
    
         
            -
             
     | 
| 
       217 
     | 
    
         
            -
            full = 0
         
     | 
| 
       218 
     | 
    
         
            -
             
     | 
| 
       219 
     | 
    
         
            -
            ARGV.each do |path|
         
     | 
| 
       220 
     | 
    
         
            -
              File.file?(path) or next
         
     | 
| 
       221 
     | 
    
         
            -
             
     | 
| 
       222 
     | 
    
         
            -
              print "Processing #{path.inspect}"
         
     | 
| 
       223 
     | 
    
         
            -
             
     | 
| 
       224 
     | 
    
         
            -
              rows.clear
         
     | 
| 
       225 
     | 
    
         
            -
              cols.clear
         
     | 
| 
       226 
     | 
    
         
            -
              seen = 0
         
     | 
| 
       227 
     | 
    
         
            -
              coun += 1
         
     | 
| 
       228 
     | 
    
         
            -
             
     | 
| 
       229 
     | 
    
         
            -
              dest = "#{path}-#{rand}"
         
     | 
| 
       230 
     | 
    
         
            -
             
     | 
| 
       231 
     | 
    
         
            -
              begin
         
     | 
| 
       232 
     | 
    
         
            -
                Censive.writer(dest) do |file|
         
     | 
| 
       233 
     | 
    
         
            -
                  Censive.new(File.read(path), relax: true).each do |cols|
         
     | 
| 
       234 
     | 
    
         
            -
                    cols.each {|cell| cell && cell.size >= 3 && cell.sub!(/\A="/, '') && cell.sub!(/"\z/, '') }
         
     | 
| 
       235 
     | 
    
         
            -
                    file << cols
         
     | 
| 
       236 
     | 
    
         
            -
                    seen += 1
         
     | 
| 
       237 
     | 
    
         
            -
                    print "." if (seen % 1e5) == 0
         
     | 
| 
       238 
     | 
    
         
            -
                  end
         
     | 
| 
       239 
     | 
    
         
            -
                end
         
     | 
| 
       240 
     | 
    
         
            -
                FileUtils.mv(dest, path)
         
     | 
| 
       241 
     | 
    
         
            -
                full += (seen - 1)
         
     | 
| 
       242 
     | 
    
         
            -
                puts " (#{seen - 1} rows of data)"
         
     | 
| 
       243 
     | 
    
         
            -
              rescue
         
     | 
| 
       244 
     | 
    
         
            -
                puts " - unable to process (#{$!})"
         
     | 
| 
       245 
     | 
    
         
            -
                FileUtils.rm_f(dest)
         
     | 
| 
       246 
     | 
    
         
            -
              end
         
     | 
| 
       247 
     | 
    
         
            -
            end
         
     | 
| 
       248 
     | 
    
         
            -
             
     | 
| 
       249 
     | 
    
         
            -
            puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
         
     | 
| 
       250 
     | 
    
         
            -
             
     | 
| 
       251 
     | 
    
         
            -
            __END__
         
     | 
| 
       252 
     | 
    
         
            -
            ,"CHUI, LOK HANG "BENNY",   =>   ,"""CHUI, LOK HANG ""BENNY""",
         
     | 
| 
      
 211 
     | 
    
         
            +
            # STDOUT.sync = true
         
     | 
| 
      
 212 
     | 
    
         
            +
            #
         
     | 
| 
      
 213 
     | 
    
         
            +
            # data = File.read('1.csv')
         
     | 
| 
      
 214 
     | 
    
         
            +
            #
         
     | 
| 
      
 215 
     | 
    
         
            +
            # Censive.writer('out.csv') do |out|
         
     | 
| 
      
 216 
     | 
    
         
            +
            #   Censive.new(data, relax: true, excel: true).each do |row|
         
     | 
| 
      
 217 
     | 
    
         
            +
            #     out << row
         
     | 
| 
      
 218 
     | 
    
         
            +
            #   end
         
     | 
| 
      
 219 
     | 
    
         
            +
            # end
         
     | 
| 
      
 220 
     | 
    
         
            +
            #
         
     | 
| 
      
 221 
     | 
    
         
            +
            # __END__
         
     | 
| 
       253 
222 
     | 
    
         | 
| 
       254 
     | 
    
         
            -
             
     | 
| 
      
 223 
     | 
    
         
            +
            ARGV << "z.csv" if ARGV.empty?
         
     | 
| 
       255 
224 
     | 
    
         | 
| 
       256 
     | 
    
         
            -
             
     | 
| 
      
 225 
     | 
    
         
            +
            path = ARGV.first
         
     | 
| 
      
 226 
     | 
    
         
            +
            data = File.read(path)
         
     | 
| 
       257 
227 
     | 
    
         | 
| 
      
 228 
     | 
    
         
            +
            csv = Censive.new(data)
         
     | 
| 
       258 
229 
     | 
    
         | 
| 
       259 
     | 
    
         
            -
            data  
     | 
| 
      
 230 
     | 
    
         
            +
            data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
         
     | 
| 
       260 
231 
     | 
    
         | 
| 
       261 
     | 
    
         
            -
             
     | 
| 
       262 
     | 
    
         
            -
              Censive.new(data).each do |row|
         
     | 
| 
       263 
     | 
    
         
            -
                out << row
         
     | 
| 
       264 
     | 
    
         
            -
              end
         
     | 
| 
       265 
     | 
    
         
            -
            end
         
     | 
| 
       266 
     | 
    
         
            -
             
     | 
| 
       267 
     | 
    
         
            -
            # ARGV << "z.csv" if ARGV.empty?
         
     | 
| 
       268 
     | 
    
         
            -
            #
         
     | 
| 
       269 
     | 
    
         
            -
            # case 1
         
     | 
| 
       270 
     | 
    
         
            -
            # when 1
         
     | 
| 
       271 
     | 
    
         
            -
            #   path = ARGV.first
         
     | 
| 
       272 
     | 
    
         
            -
            #   data = File.read(path)
         
     | 
| 
       273 
     | 
    
         
            -
            # when 2
         
     | 
| 
       274 
     | 
    
         
            -
            #   data = DATA.gets("\n\n").rstrip
         
     | 
| 
       275 
     | 
    
         
            -
            # end
         
     | 
| 
       276 
     | 
    
         
            -
            #
         
     | 
| 
       277 
     | 
    
         
            -
            # STDOUT.sync = true
         
     | 
| 
       278 
     | 
    
         
            -
            #
         
     | 
| 
       279 
     | 
    
         
            -
            # csv = Censive.new(data)
         
     | 
| 
       280 
     | 
    
         
            -
            #
         
     | 
| 
       281 
     | 
    
         
            -
            # data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
         
     | 
| 
       282 
     | 
    
         
            -
            #
         
     | 
| 
       283 
     | 
    
         
            -
            # csv.stats
         
     | 
| 
      
 232 
     | 
    
         
            +
            csv.stats
         
     |