RubyGems - iriq - Versions diffs - 0.0.1 → 0.2.0 - Mend

iriq 0.0.1 → 0.2.0

This diff compares the contents of two publicly available versions of a package that were released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (29)

checksums.yaml +4 -4
data/CHANGELOG.md +25 -0
data/CLAUDE.md +121 -0
data/Gemfile.lock +8 -2
data/Makefile +56 -0
data/README.md +334 -39
data/iriq.gemspec +4 -3
data/lib/iriq/cli.rb +289 -100
data/lib/iriq/cluster.rb +47 -0
data/lib/iriq/clusterer.rb +29 -39
data/lib/iriq/corpus.rb +322 -0
data/lib/iriq/explanation.rb +6 -22
data/lib/iriq/extractor.rb +125 -0
data/lib/iriq/identifier.rb +11 -3
data/lib/iriq/inflector.rb +145 -0
data/lib/iriq/normalizer.rb +11 -8
data/lib/iriq/observation.rb +25 -0
data/lib/iriq/parser.rb +1 -1
data/lib/iriq/path_shape.rb +27 -9
data/lib/iriq/position_stats.rb +64 -0
data/lib/iriq/segment_classifier.rb +31 -7
data/lib/iriq/segment_hints.rb +32 -0
data/lib/iriq/storage/json.rb +43 -0
data/lib/iriq/storage/memory.rb +138 -0
data/lib/iriq/storage/sqlite.rb +367 -0
data/lib/iriq/storage.rb +35 -0
data/lib/iriq/version.rb +1 -1
data/lib/iriq.rb +11 -0
metadata +29 -4

data/lib/iriq/cli.rb CHANGED Viewed

@@ -1,32 +1,55 @@
 require "json"
 require "optparse"
+require "stringio"
 module Iriq
-  # Tiny CLI wrapper around the public API. Construct with explicit IO so
-  # specs can run it without shelling out.
+  # Flag-driven CLI. The default action for an input is a combined parse +
+  # normalize + explain summary; the -p/-n/-e flags select individual
+  # sections. The only subcommand is `cluster`, which is structurally
+  # different (many inputs, not one). Construct with explicit IO so specs
+  # can run it without shelling out.
   class CLI
-    COMMANDS = %w[parse normalize explain classify cluster help version].freeze
+    SECTION_FLAGS = %i[parse normalize].freeze
+    TOP_N_STATS   = 10
+    # When extraction yields this many or more IRIs, the default pipe
+    # output switches from a URL list to clusters — a longer list is
+    # easier to read as route-shape groups.
+    LARGE_BATCH_THRESHOLD = 10
     USAGE = <<~TXT
-      Usage: iriq <command> [options] [args]
+      Usage: iriq [options] <input>
+             iriq [options] < text
+             iriq cluster [options] [file]
+      <input> may be an IRI, a file path (extracted automatically), or piped
+      text via stdin.
+      Sections (combine freely):
+        -n, --normalize       Shape-normalized form
+        -p, --parse           Parsed fields
-      Commands:
-        parse <input>          Parse an identifier and print its fields
-        normalize <input>      Print the shape-normalized form
-        explain <input>        Annotate each path segment
-        classify <segment>     Classify a single segment
-        cluster [file]         Cluster identifiers from FILE or stdin (one per line)
-        help                   Show this message
-        version                Print version
+      Corpus + stats:
+            --corpus PATH     Load/create a JSON corpus; observe and save atomically.
+                              -n becomes corpus-informed once it has data.
+            --stats           Print rolling aggregates
-      Options:
-        -j, --json             Emit JSON instead of human-readable output
-        -h, --help             Show this message
+      Other:
+        -h, --help            Show this message
+        -j, --json            Emit JSON instead of human-readable output
+        -N, --no-hints        Use {integer_id} placeholders instead of {user_id}
+            --no-scheme-less  Skip foo.com/path extraction (explicit-scheme only)
+        -V, --version         Print version
+      Subcommands:
+        cluster [file]        Force cluster view (default for ≥10 IRIs anyway)
       Examples:
-        iriq parse https://foo.com/users/123
-        iriq normalize foo.com/users/456
-        echo "https://foo.com/users/1\\nhttps://foo.com/users/2" | iriq cluster
+        iriq foo.com/users/456
+        iriq -n https://foo.com/users/123
+        iriq ./access.log                     # auto-detect file → extract URLs
+        cat README.md | iriq -n               # one normalized URL per line
+        cat README.md | iriq --corpus c.json
     TXT
     attr_reader :stdin, :stdout, :stderr
@@ -41,16 +64,35 @@ module Iriq
     def run(argv)
       args, opts = parse_options(argv)
-      cmd = args.shift
-      return print_usage(stdout, 0) if cmd.nil? || cmd == "help" || opts[:help]
+      return print_usage(stdout, 0) if opts[:help]
+      return print_version          if opts[:version]
+      explicit_cluster = (args.first == "cluster")
+      args.shift if explicit_cluster
+      # Auto-detect: a positional argument that isn't parseable as an IRI
+      # but IS an existing file gets treated as a file to extract from. This
+      # is what makes `iriq ./access.log` and `iriq /var/log/foo.log` Just
+      # Work without a separate --extract flag.
+      positional_is_file = args.first && File.file?(args.first) && !parseable_iri?(args.first)
+      batch_mode = explicit_cluster || positional_is_file ||
+                   (args.empty? && piped_stdin?)
+      return print_usage(stdout, 0) if args.empty? && !batch_mode
+      corpus = opts[:corpus] ? load_corpus(opts[:corpus]) : nil
-      unless COMMANDS.include?(cmd)
-        stderr.puts "iriq: unknown command #{cmd.inspect}"
-        print_usage(stderr, 1)
-        return 1
+      code = if batch_mode
+        cmd_batch(args, opts, corpus, explicit_cluster: explicit_cluster)
+      elsif opts[:stats]
+        cmd_stats(corpus, opts)
+      else
+        cmd_summary(args, opts, corpus)
       end
-      send("cmd_#{cmd}", args, opts)
+      corpus.save(opts[:corpus]) if corpus && opts[:corpus]
+      code
     rescue Iriq::ParseError => e
       stderr.puts "iriq: parse error: #{e.message}"
       2
@@ -59,89 +101,178 @@ module Iriq
       1
     end
+    def parseable_iri?(input)
+      Iriq.parse(input)
+      true
+    rescue Iriq::ParseError
+      false
+    end
     private
     def parse_options(argv)
-      opts = { json: false, help: false }
+      opts = {
+        json:        false,
+        help:        false,
+        version:     false,
+        hints:       true,
+        sections:    [],
+        corpus:      nil,
+        stats:       false,
+        scheme_less: true,
+      }
       parser = OptionParser.new do |o|
-        o.on("-j", "--json") { opts[:json] = true }
-        o.on("-h", "--help") { opts[:help] = true }
+        o.on("-p", "--parse")        { opts[:sections] << :parse }
+        o.on("-n", "--normalize")    { opts[:sections] << :normalize }
+        o.on("-j", "--json")         { opts[:json]    = true }
+        o.on("--[no-]hints")         { |v| opts[:hints] = v }
+        o.on("-N")                   { opts[:hints] = false }
+        o.on("--corpus PATH")        { |v| opts[:corpus] = v }
+        o.on("--stats")              { opts[:stats]   = true }
+        o.on("--[no-]scheme-less")   { |v| opts[:scheme_less] = v }
+        o.on("-h", "--help")         { opts[:help]    = true }
+        o.on("-V", "--version")      { opts[:version] = true }
       end
       args = parser.parse(argv)
       [args, opts]
     end
-    def print_usage(io, code)
-      io.puts USAGE
-      code
+    def piped_stdin?
+      # StringIO is the test injection point; treat it as "piped" only when
+      # it actually has content. Real stdin: tty? tells us.
+      if stdin.is_a?(StringIO)
+        stdin.size.positive?
+      elsif stdin.respond_to?(:tty?)
+        !stdin.tty?
+      else
+        true
+      end
     end
-    def require_arg!(args, name)
-      return args.first if args.first
+    def load_corpus(path)
+      Corpus.open(path)
+    end
-      stderr.puts "iriq: missing argument <#{name}>"
-      throw :missing_arg, 1
+    def print_usage(io, code)
+      io.puts USAGE
+      code
     end
-    def cmd_version(_args, _opts)
+    def print_version
       stdout.puts Iriq::VERSION
       0
     end
-    def cmd_parse(args, opts)
-      input = args.first or return missing(:input)
-      iri   = Iriq.parse(input)
-      emit_parse(iri, opts)
+    def cmd_summary(args, opts, corpus)
+      input    = args.first or return missing(:input)
+      iri      = Iriq.parse(input)
+      corpus&.observe(iri)
+      sections = opts[:sections].empty? ? SECTION_FLAGS : opts[:sections]
+      data = {}
+      data[:parse]     = identifier_hash(iri) if sections.include?(:parse)
+      if sections.include?(:normalize)
+        data[:normalize] = corpus ? corpus.normalize(iri) : Normalizer.normalize_identifier(iri, hints: opts[:hints])
+      end
+      if opts[:json]
+        payload = sections.size == 1 ? data.values.first : data
+        stdout.puts JSON.generate(payload)
+      else
+        emit_sections(data, sections)
+      end
       0
     end
-    def cmd_normalize(args, opts)
-      input = args.first or return missing(:input)
-      out   = Iriq.normalize(input)
-      opts[:json] ? stdout.puts(JSON.generate(normalized: out)) : stdout.puts(out)
+    # Used for the `cluster` subcommand and implicit piped batch mode. Reads
+    # the whole input as text and runs it through the extractor — so a file
+    # of URLs (one per line) and a file of prose with URLs both work. The
+    # corpus is ephemeral unless --corpus was given.
+    def cmd_batch(args, opts, corpus, explicit_cluster: false)
+      corpus ||= Corpus.new
+      iris = extract_text(read_text(args.first), opts)
+      corpus.batch { iris.each { |iri| corpus.observe(iri) } }
+      if opts[:sections].any?
+        emit_per_iri_sections(iris, opts)
+      elsif opts[:stats]
+        emit_stats(corpus, opts)
+      elsif explicit_cluster || iris.size >= LARGE_BATCH_THRESHOLD
+        # Either the user asked for clusters explicitly, or the input is
+        # big enough that the cluster summary beats a long URL list.
+        emit_clusters(corpus.clusters, opts)
+      else
+        emit_url_list(iris, opts)
+      end
       0
     end
-    def cmd_explain(args, opts)
-      input = args.first or return missing(:input)
-      rows  = Iriq.explain(input)
+    # Emit the requested sections (parse/normalize/explain) for each
+    # extracted IRI. -n alone is the cleanest case: one line per URL.
+    def emit_per_iri_sections(iris, opts)
+      sections = opts[:sections]
+      payloads = iris.map { |iri| section_payload(iri, sections, opts) }
       if opts[:json]
-        stdout.puts JSON.generate(rows)
+        out = sections.size == 1 ? payloads.map(&:values).flatten(1) : payloads
+        stdout.puts JSON.generate(out)
+      elsif sections == [:normalize]
+        # Most common case — keep it tight: one URL per line, no headers.
+        payloads.each { |p| stdout.puts p[:normalize] }
       else
-        rows.each do |r|
-          mark = r[:variable] ? "*" : " "
-          stdout.printf("%s %-12s %s\n", mark, r[:type], r[:value])
+        payloads.each_with_index do |p, i|
+          stdout.puts if i > 0
+          stdout.puts "# #{iris[i].canonical}"
+          sections.each_with_index do |sec, j|
+            stdout.puts if j > 0  # blank line between sections within one IRI
+            case sec
+            when :parse     then emit_parse_human(p[:parse])
+            when :normalize then stdout.puts p[:normalize]
+            end
+          end
         end
       end
-      0
     end
-    def cmd_classify(args, opts)
-      seg  = args.first or return missing(:segment)
-      type = SegmentClassifier.new.classify(seg)
-      opts[:json] ? stdout.puts(JSON.generate(value: seg, type: type)) : stdout.puts(type)
-      0
+    def section_payload(iri, sections, opts)
+      data = {}
+      data[:parse]     = identifier_hash(iri)                                       if sections.include?(:parse)
+      data[:normalize] = Normalizer.normalize_identifier(iri, hints: opts[:hints])  if sections.include?(:normalize)
+      data
     end
-    def cmd_cluster(args, opts)
-      lines = read_input(args.first)
-      clusterer = Clusterer.new
-      lines.each do |line|
-        line = line.strip
-        next if line.empty?
+    def extract_text(text, opts)
+      Extractor.new(scheme_less: opts[:scheme_less]).extract(text)
+    end
-        begin
-          clusterer.add(line)
-        rescue Iriq::ParseError => e
-          stderr.puts "iriq: skipped #{line.inspect}: #{e.message}"
-        end
+    # Emit a deduplicated list of IRIs with occurrence counts, sorted desc
+    # by count then by first-seen order. If every IRI is a singleton the
+    # `[1]` prefix is omitted — just print the URLs.
+    def emit_url_list(iris, opts)
+      counts = Hash.new(0)
+      first  = {}
+      iris.each_with_index do |iri, i|
+        key = iri.canonical
+        counts[key] += 1
+        first[key] ||= i
+      end
+      sorted = counts.sort_by { |k, c| [-c, first[k]] }
+      if opts[:json]
+        stdout.puts JSON.generate(sorted.map { |k, c| { iri: k, count: c } })
+      elsif sorted.all? { |_, c| c == 1 }
+        sorted.each { |k, _| stdout.puts k }
+      else
+        sorted.each { |k, c| stdout.puts "[#{c}] #{k}" }
       end
-      emit_clusters(clusterer.clusters, opts)
-      0
     end
-    def cmd_help(_args, _opts)
-      print_usage(stdout, 0)
+    def cmd_stats(corpus, opts)
+      return missing("--corpus") unless corpus
+      emit_stats(corpus, opts)
+      0
     end
     def missing(name)
@@ -157,49 +288,107 @@ module Iriq
       end
     end
-    def emit_parse(iri, opts)
-      if opts[:json]
-        stdout.puts JSON.generate(
-          original:      iri.original,
-          kind:          iri.kind,
-          scheme:        iri.scheme,
-          host:          iri.host,
-          port:          iri.port,
-          path_segments: iri.path_segments,
-          query_params:  iri.query_params,
-          fragment:      iri.fragment,
-          nss:           iri.nss,
-          canonical:     iri.canonical,
-        )
+    def read_text(path)
+      if path.nil? || path == "-"
+        stdin.read
       else
-        stdout.puts "original:      #{iri.original}"
-        stdout.puts "kind:          #{iri.kind}"
-        stdout.puts "scheme:        #{iri.scheme}" if iri.scheme
-        stdout.puts "host:          #{iri.host}"   if iri.host
-        stdout.puts "port:          #{iri.port}"   if iri.port
-        stdout.puts "path_segments: #{iri.path_segments.inspect}" if iri.url?
-        unless iri.query_params.empty?
-          stdout.puts "query_params:  #{iri.query_params.inspect}"
+        File.read(path)
+      end
+    end
+    # Compact identifier hash for parse output (both JSON and human). Drops
+    # nil values and empty collections so URN dumps don't carry empty
+    # host/path/query slots, and URL dumps don't include null fragment/nss.
+    def identifier_hash(iri)
+      {
+        original:      iri.original,
+        kind:          iri.kind,
+        scheme:        iri.scheme,
+        host:          iri.host,
+        port:          iri.port,
+        path_segments: iri.path_segments,
+        query_params:  iri.query_params,
+        fragment:      iri.fragment,
+        nss:           iri.nss,
+        canonical:     iri.canonical,
+      }.reject { |_, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
+    end
+    def emit_sections(data, sections)
+      multi = sections.size > 1
+      sections.each_with_index do |sec, i|
+        stdout.puts if i > 0
+        stdout.puts "# #{sec}" if multi
+        case sec
+        when :parse     then emit_parse_human(data[:parse])
+        when :normalize then stdout.puts data[:normalize]
         end
-        stdout.puts "fragment:      #{iri.fragment}" if iri.fragment
-        stdout.puts "nss:           #{iri.nss}"      if iri.nss
-        stdout.puts "canonical:     #{iri.canonical}"
       end
     end
+    def emit_parse_human(h)
+      stdout.puts "original:      #{h[:original]}"
+      stdout.puts "kind:          #{h[:kind]}"
+      stdout.puts "scheme:        #{h[:scheme]}" if h[:scheme]
+      stdout.puts "host:          #{h[:host]}"   if h[:host]
+      stdout.puts "port:          #{h[:port]}"   if h[:port]
+      stdout.puts "path_segments: #{h[:path_segments].inspect}" if h[:kind] == :url
+      stdout.puts "query_params:  #{h[:query_params].inspect}" if h[:query_params] && !h[:query_params].empty?
+      stdout.puts "fragment:      #{h[:fragment]}" if h[:fragment]
+      stdout.puts "nss:           #{h[:nss]}"      if h[:nss]
+      stdout.puts "canonical:     #{h[:canonical]}"
+    end
     def emit_clusters(clusters, opts)
       sorted = clusters.sort_by { |c| -c.count }
       if opts[:json]
         stdout.puts JSON.generate(sorted.map(&:to_h))
       else
-        sorted.each do |c|
-          host = c.host || "(urn)"
-          stdout.puts "[#{c.count}] #{host}  #{c.shape}"
+        sorted.each_with_index do |c, i|
+          stdout.puts if i > 0
+          host  = c.host || "(urn)"
+          shape = opts[:hints] ? c.shape : raw_shape_for(c)
+          stdout.puts "[#{c.count}] #{host}  #{shape}"
           c.examples.first(3).each { |e| stdout.puts "    #{e.canonical}" }
           stdout.puts "    + #{c.count - 3} more" if c.count > 3
         end
       end
     end
+    def raw_shape_for(cluster)
+      example = cluster.examples.first
+      return cluster.shape unless example
+      PathShape.for(example.path_segments, hints: false)
+    end
+    def emit_stats(corpus, opts)
+      payload = {
+        observations: corpus.host_counts.values.sum,
+        clusters:     corpus.size,
+        hosts:        top(corpus.host_counts),
+        shapes:       top(corpus.fingerprint_counts),
+        raw_shapes:   top(corpus.raw_shape_counts),
+      }
+      if opts[:json]
+        stdout.puts JSON.generate(payload)
+      else
+        stdout.puts "observations: #{payload[:observations]}"
+        stdout.puts "clusters:     #{payload[:clusters]}"
+        stdout.puts
+        stdout.puts "top hosts:"
+        payload[:hosts].each { |h, n| stdout.puts "  #{n.to_s.rjust(6)}  #{h}" }
+        stdout.puts
+        stdout.puts "top shapes:"
+        shapes = opts[:hints] ? payload[:shapes] : payload[:raw_shapes]
+        shapes.each { |s, n| stdout.puts "  #{n.to_s.rjust(6)}  #{s}" }
+      end
+    end
+    def top(hash)
+      hash.sort_by { |_, n| -n }.first(TOP_N_STATS).to_h
+    end
   end
 end

data/lib/iriq/cluster.rb CHANGED Viewed

@@ -54,5 +54,52 @@ module Iriq
         segments: segment_stats,
       }
     end
+    # JSON-friendly dump for persistence (distinct from #to_h which is a
+    # display form). Examples are dumped as canonical strings and re-parsed
+    # on load.
+    def dump
+      {
+        "key"            => key,
+        "host"           => host,
+        "scheme"         => scheme,
+        "shape"          => shape,
+        "count"          => count,
+        "examples"       => examples.map(&:canonical),
+        "segment_counts" => @segment_counts.map { |h| h || {} },
+      }
+    end
+    def self.from_dump(h)
+      cluster = new(key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"])
+      cluster.instance_variable_set(:@count, h["count"])
+      cluster.instance_variable_set(:@examples, h["examples"].map { |s| Parser.parse(s) })
+      cluster.instance_variable_set(:@segment_counts, h["segment_counts"].map { |sub| Hash.new(0).merge(sub) })
+      cluster
+    end
+    # Shared cluster-key derivation. Returns [key, host, scheme, shape] —
+    # callers that already have a hinted shape can pass it in to skip the
+    # recomputation; URN inputs ignore the override and always derive their
+    # own shape from the NSS value.
+    def self.key_for(iri, classifier:, shape: nil)
+      if iri.urn?
+        ns, value = (iri.nss || "").split(":", 2)
+        derived = value ? urn_value_shape(ns, value, classifier) : nil
+        key     = "urn:#{ns}:#{derived}"
+        [key, nil, "urn", key]
+      else
+        shape ||= PathShape.new(classifier: classifier).for(iri.path_segments)
+        key = "#{iri.scheme}://#{iri.host}#{shape}"
+        [key, iri.host, iri.scheme, shape]
+      end
+    end
+    def self.urn_value_shape(ns, value, classifier)
+      entry = SegmentHints.derive([ns, value], classifier).last
+      return entry[:value] unless entry[:variable]
+      "{#{entry[:hint] || entry[:type]}}"
+    end
   end
 end

data/lib/iriq/clusterer.rb CHANGED Viewed

@@ -3,31 +3,28 @@ module Iriq
   # `clusters` to read out the groups. `explain` annotates a single identifier
   # against the cluster it would fall into, including which positions are
   # stable across all observed members.
+  #
+  # Implemented as a thin wrapper over Storage::Memory — the same code path
+  # Corpus uses for the cluster portion of its state, so there's only one
+  # place that knows how clusters get stored.
   class Clusterer
-    def initialize(classifier: SegmentClassifier.new)
+    def initialize(classifier: SegmentClassifier::DEFAULT)
       @classifier = classifier
-      @clusters   = {}
+      @storage    = Storage::Memory.new(classifier: classifier)
     end
-    def add(input)
+    def add(input, shape: nil)
       iri = coerce(input)
-      key, host, scheme, shape = cluster_key(iri)
-      cluster = @clusters[key] ||= Cluster.new(
-        key:    key,
-        host:   host,
-        scheme: scheme,
-        shape:  shape,
-      )
-      cluster.add(iri)
-      cluster
+      key, host, scheme, derived = Cluster.key_for(iri, classifier: @classifier, shape: shape)
+      @storage.add_to_cluster(key, host, scheme, derived, iri)
     end
     def clusters
-      @clusters.values
+      @storage.clusters
     end
     def size
-      @clusters.size
+      @storage.cluster_size
     end
     # Returns a per-segment explanation for the input, merging classifier
@@ -36,42 +33,35 @@ module Iriq
     # would otherwise call them variable).
     def explain(input)
       iri = coerce(input)
-      key, * = cluster_key(iri)
-      cluster = @clusters[key]
+      key, * = Cluster.key_for(iri, classifier: @classifier)
+      cluster = clusters.find { |c| c.key == key }
       stats   = cluster ? cluster.segment_stats : []
+      hinted  = SegmentHints.derive(iri.path_segments, @classifier)
-      iri.path_segments.each_with_index.map do |seg, i|
-        type   = @classifier.classify(seg)
+      hinted.each_with_index.map do |entry, i|
         stable = stats[i] && stats[i][:stable]
-        {
-          value:    seg,
-          type:     type,
-          variable: !stable && @classifier.variable?(type),
+        entry.merge(
+          variable: !stable && entry[:variable],
           stable:   !!stable,
-        }
+        )
       end
     end
+    def dump
+      { "clusters" => clusters.each_with_object({}) { |c, h| h[c.key] = c.dump } }
+    end
+    def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
+      c = new(classifier: classifier)
+      restored = h["clusters"].transform_values { |cdump| Cluster.from_dump(cdump) }
+      c.instance_variable_get(:@storage).instance_variable_set(:@clusters, restored)
+      c
+    end
     private
     def coerce(input)
       input.is_a?(Identifier) ? input : Parser.parse(input)
     end
-    def cluster_key(iri)
-      if iri.urn?
-        ns, value = (iri.nss || "").split(":", 2)
-        shape = if value
-          type = @classifier.classify(value)
-          @classifier.variable?(type) ? "{#{type}}" : value
-        end
-        key = "urn:#{ns}:#{shape}"
-        [key, nil, "urn", key]
-      else
-        shape = PathShape.new(classifier: @classifier).for(iri.path_segments)
-        key   = "#{iri.scheme}://#{iri.host}#{shape}"
-        [key, iri.host, iri.scheme, shape]
-      end
-    end
   end
 end