fluent-plugin-openlineage 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +6 -0
  3. data/.github/workflows/linux.yml +30 -0
  4. data/.gitignore +16 -0
  5. data/.idea/.gitignore +8 -0
  6. data/.idea/fluentd.iml +204 -0
  7. data/.idea/misc.xml +4 -0
  8. data/.idea/modules/benchmark-memory-0.2.iml +12 -0
  9. data/.idea/modules/bigdecimal-3.1.iml +11 -0
  10. data/.idea/modules/certstore_c-0.1.iml +15 -0
  11. data/.idea/modules/concurrent-ruby-1.3.iml +18 -0
  12. data/.idea/modules/concurrent-ruby-1.31.iml +15 -0
  13. data/.idea/modules/connection_pool-2.4.iml +11 -0
  14. data/.idea/modules/cool.io-1.8.iml +16 -0
  15. data/.idea/modules/drb-2.2.iml +14 -0
  16. data/.idea/modules/drb-2.21.iml +11 -0
  17. data/.idea/modules/ffi-1.17.iml +20 -0
  18. data/.idea/modules/ffi-win32-extensions-1.0.iml +19 -0
  19. data/.idea/modules/fluentd-1.17.iml +43 -0
  20. data/.idea/modules/http_parser.rb-0.8.iml +17 -0
  21. data/.idea/modules/json-2.7.iml +14 -0
  22. data/.idea/modules/json-2.71.iml +11 -0
  23. data/.idea/modules/msgpack-1.7.iml +15 -0
  24. data/.idea/modules/mutex_m-0.2.iml +15 -0
  25. data/.idea/modules/new_gem.iml +15 -0
  26. data/.idea/modules/power_assert-2.0.iml +19 -0
  27. data/.idea/modules/rake-13.2.iml +18 -0
  28. data/.idea/modules/rake-13.21.iml +15 -0
  29. data/.idea/modules/rake-compiler-1.2.iml +13 -0
  30. data/.idea/modules/rusty_json_schema-0.15.iml +15 -0
  31. data/.idea/modules/serverengine-2.3.iml +17 -0
  32. data/.idea/modules/sigdump-0.2.iml +16 -0
  33. data/.idea/modules/specifications.iml +14 -0
  34. data/.idea/modules/specifications1.iml +11 -0
  35. data/.idea/modules/strptime-0.2.iml +16 -0
  36. data/.idea/modules/thermite-0.13.iml +17 -0
  37. data/.idea/modules/webrick-1.8.iml +18 -0
  38. data/.idea/modules/win32-event-0.6.iml +21 -0
  39. data/.idea/modules/win32-ipc-0.7.iml +20 -0
  40. data/.idea/modules/yajl-ruby-1.4.iml +779 -0
  41. data/.idea/modules.xml +41 -0
  42. data/.rspec +2 -0
  43. data/ChangeLog +3 -0
  44. data/Gemfile +3 -0
  45. data/LICENSE +202 -0
  46. data/README.md +250 -0
  47. data/Rakefile +13 -0
  48. data/fluent-plugin-openlineage.gemspec +28 -0
  49. data/lib/fluent/plugin/parser_openlineage.rb +182 -0
  50. data/misc/fluent.conf +101 -0
  51. data/misc/test-complete.json +73 -0
  52. data/misc/test-start.json +73 -0
  53. data/spec/Naming.md +500 -0
  54. data/spec/OpenLineage.json +304 -0
  55. data/spec/Versioning.md +49 -0
  56. data/spec/events/event_full.json +206 -0
  57. data/spec/events/event_invalid_dataset_facet.json +31 -0
  58. data/spec/events/event_invalid_input_dataset_facet.json +29 -0
  59. data/spec/events/event_invalid_job_facet.json +26 -0
  60. data/spec/events/event_invalid_output_dataset_facet.json +29 -0
  61. data/spec/events/event_invalid_run_facet.json +28 -0
  62. data/spec/events/event_no_run_id.json +28 -0
  63. data/spec/events/event_simple.json +29 -0
  64. data/spec/facets/ColumnLineageDatasetFacet.json +96 -0
  65. data/spec/facets/ColumnLineageDatasetFacet.md +106 -0
  66. data/spec/facets/DataQualityAssertionsDatasetFacet.json +49 -0
  67. data/spec/facets/DataQualityMetricsInputDatasetFacet.json +76 -0
  68. data/spec/facets/DatasetVersionDatasetFacet.json +31 -0
  69. data/spec/facets/DatasourceDatasetFacet.json +32 -0
  70. data/spec/facets/DocumentationDatasetFacet.json +31 -0
  71. data/spec/facets/DocumentationJobFacet.json +30 -0
  72. data/spec/facets/ErrorMessageRunFacet.json +41 -0
  73. data/spec/facets/ExternalQueryRunFacet.json +36 -0
  74. data/spec/facets/ExternalQueryRunFacet.md +49 -0
  75. data/spec/facets/ExtractionErrorRunFacet.json +58 -0
  76. data/spec/facets/JobTypeJobFacet.json +41 -0
  77. data/spec/facets/LifecycleStateChangeDatasetFacet.json +46 -0
  78. data/spec/facets/NominalTimeRunFacet.json +38 -0
  79. data/spec/facets/OutputStatisticsOutputDatasetFacet.json +36 -0
  80. data/spec/facets/OwnershipDatasetFacet.json +45 -0
  81. data/spec/facets/OwnershipJobFacet.json +45 -0
  82. data/spec/facets/ParentRunFacet.json +54 -0
  83. data/spec/facets/ProcessingEngineRunFacet.json +41 -0
  84. data/spec/facets/SQLJobFacet.json +30 -0
  85. data/spec/facets/SchemaDatasetFacet.json +59 -0
  86. data/spec/facets/SourceCodeJobFacet.json +34 -0
  87. data/spec/facets/SourceCodeLocationJobFacet.json +60 -0
  88. data/spec/facets/StorageDatasetFacet.json +35 -0
  89. data/spec/facets/SymlinksDatasetFacet.json +47 -0
  90. data/spec/fluent/plugin/test_parser_openlineage.rb +141 -0
  91. data/spec/registry/core/registry.json +31 -0
  92. data/spec/registry/gcp/facets/GcpCommonJobFacet.json +43 -0
  93. data/spec/registry/gcp/registry.json +6 -0
  94. data/spec/spec_helper.rb +8 -0
  95. data/spec/tests/ColumnLineageDatasetFacet/1.json +172 -0
  96. data/spec/tests/DataQualityAssertionsDatasetFacet/1.json +58 -0
  97. data/spec/tests/DataQualityMetricsInputDatasetFacet/1.json +23 -0
  98. data/spec/tests/DatasetVersionDatasetFacet/1.json +7 -0
  99. data/spec/tests/DatasourceDatasetFacet/1.json +7 -0
  100. data/spec/tests/DocumentationDatasetFacet/1.json +7 -0
  101. data/spec/tests/DocumentationJobFacet/1.json +7 -0
  102. data/spec/tests/ErrorMessageRunFacet/1.json +9 -0
  103. data/spec/tests/ExternalQueryRunFacet/1.json +8 -0
  104. data/spec/tests/ExtractionErrorRunFacet/1.json +15 -0
  105. data/spec/tests/JobTypeJobFacet/1.json +9 -0
  106. data/spec/tests/LifecycleStateChangeDatasetFacet/1.json +11 -0
  107. data/spec/tests/NominalTimeRunFacet/1.json +8 -0
  108. data/spec/tests/OutputStatisticsOutputDatasetFacet/1.json +9 -0
  109. data/spec/tests/OwnershipDatasetFacet/1.json +11 -0
  110. data/spec/tests/OwnershipJobFacet/1.json +11 -0
  111. data/spec/tests/ParentRunFacet/1.json +13 -0
  112. data/spec/tests/ProcessingEngineRunFacet/1.json +9 -0
  113. data/spec/tests/SQLJobFacet/1.json +7 -0
  114. data/spec/tests/SchemaDatasetFacet/1.json +92 -0
  115. data/spec/tests/SourceCodeJobFacet/1.json +8 -0
  116. data/spec/tests/SourceCodeLocationJobFacet/1.json +8 -0
  117. data/spec/tests/StorageDatasetFacet/1.json +8 -0
  118. data/spec/tests/SymlinksDatasetFacet/1.json +13 -0
  119. data/spec/tests/example_full_event.json +24 -0
  120. metadata +188 -3
@@ -0,0 +1,182 @@
1
+ require "fluent/plugin/parser"
2
+ require 'fluent/plugin/parser_json'
3
+ require 'json'
4
+ require "rusty_json_schema"
5
+
6
module Fluent
  module Plugin
    # Fluentd parser plugin registered under the name "openlineage".
    #
    # Each incoming text is first parsed by the stock JSONParser; the resulting
    # record is then validated against the OpenLineage JSON schema found in
    # +spec_directory+. Records that fail validation raise ParserError.
    #
    # Configuration keys read in #configure:
    #   spec_directory                  directory holding OpenLineage.json and facets/ (default: /etc/spec)
    #   validate_input_dataset_facets   bool, default false
    #   validate_output_dataset_facets  bool, default false
    #   validate_dataset_facets         bool, default false
    #   validate_run_facets             bool, default true
    #   validate_job_facets             bool, default true
    class OpenlineageParser < Fluent::Plugin::JSONParser
      Fluent::Plugin.register_parser("openlineage", self)

      DEFAULT_SPEC_DIRECTORY="/etc/spec"

      # Prefix of a schema-local definition reference.
      LOCAL_DEFS_PREFIX = "#/$defs/"

      # Reference from the core schema to a facet file, e.g. "facets/SQLJobFacet.json".
      FACET_FILE_REF = /"facets\/([a-zA-Z]+)\.json"/

      # Reference from a facet file back to a core-schema definition. Each version
      # component may have more than one digit (e.g. 2-0-2 as well as 1-10-0).
      CORE_SCHEMA_REF = /"https:\/\/openlineage\.io\/spec\/\d+-\d+-\d+\/OpenLineage\.json#\/\$defs\/([a-zA-Z]+)"/

      # Generic message emitted by the validator when no oneOf candidate matches.
      ONE_OF_MISMATCH = "is not valid under any of the given schemas"

      # Reads plugin options, loads the schema and builds the validator, then
      # hands the configuration to the parent parser.
      #
      # Raises ParserError when OpenLineage.json cannot be found and
      # Fluent::ConfigError when a boolean option has an unrecognised value.
      def configure(conf)
        @spec_directory = conf.has_key?('spec_directory') ? conf['spec_directory'] : DEFAULT_SPEC_DIRECTORY
        @spec_directory += "/" unless @spec_directory.end_with?("/")

        # Values coming from a config file are strings; the string "false" is
        # truthy in Ruby, so every flag is coerced explicitly.
        @validate_input_dataset_facets = boolean_option(conf, 'validate_input_dataset_facets', false)
        @validate_output_dataset_facets = boolean_option(conf, 'validate_output_dataset_facets', false)
        @validate_dataset_facets = boolean_option(conf, 'validate_dataset_facets', false)
        @validate_run_facets = boolean_option(conf, 'validate_run_facets', true)
        @validate_job_facets = boolean_option(conf, 'validate_job_facets', true)

        @schema = load_schema
        @validator = RustyJSONSchema.build(@schema)
        super
      end

      # https://docs.fluentd.org/plugin-development/api-plugin-parser
      #
      # Parses +text+ with the default JSONParser, validates every yielded
      # record and passes it on to the caller's block unchanged.
      def parse(text)
        super(text) { |time, json|
          validate_openlineage(json)
          yield time, json
        }
      end

      private

      # Returns the boolean value of option +key+ in +conf+, or +default+ when
      # the key is absent or nil. Real booleans pass through; strings follow the
      # usual Fluentd spelling ("true"/"yes"/"" => true, "false"/"no" => false).
      def boolean_option(conf, key, default)
        return default unless conf.has_key?(key)

        value = conf[key]
        return default if value.nil?
        return value if value == true || value == false

        case value.to_s.strip.downcase
        when "true", "yes", ""
          true
        when "false", "no"
          false
        else
          raise Fluent::ConfigError, "#{key}: invalid bool value: #{value}"
        end
      end

      # Validates a parsed record against the OpenLineage schema.
      # Raises ParserError when the record is nil or violates the schema.
      def validate_openlineage(json)
        if json.nil?
          raise ParserError, "Openlineage validation failed: invalid json provided"
        end

        # https://github.com/driv3r/rusty_json_schema
        # Rust json parser ported to ruby that supports Draft 2020-12
        errors = @validator.validate(json)

        if errors.any? { |message| message.to_s.include?(ONE_OF_MISMATCH) }
          detailed = enrich_oneOf_errors(json)
          # Keep the validator's own errors when no better explanation was found,
          # so an invalid event can never slip through with an empty error list.
          errors = detailed unless detailed.empty?
        end

        return if errors.empty?

        raise ParserError, "Openlineage validation failed: " + errors.join(", ")
      end

      # Validator returns very generic OneOfNotValid error message
      # We try to find better reason for mismatch with each candidate.
      #
      # For every top-level oneOf candidate a copy of the schema is built in which
      # oneOf is replaced by a direct $ref to that candidate. Returns one message
      # per candidate that rejects +json+; returns [] when the schema has no
      # top-level oneOf list.
      def enrich_oneOf_errors(json)
        candidates = @schema["oneOf"]
        return [] unless candidates.is_a?(Array)

        candidates.each_with_object([]) do |ref, errors|
          # Deep copy so the shared @schema is never mutated.
          changed_schema = Marshal.load(Marshal.dump(@schema))
          changed_schema.delete("oneOf")
          changed_schema["$ref"] = ref["$ref"]

          error = RustyJSONSchema.build(changed_schema).validate(json)
          errors << "#{ref}: #{error.join(", ")}" unless error.empty?
        end
      end

      # Reads OpenLineage.json from the spec directory and returns it as a Hash
      # with all facet definitions inlined.
      # Raises ParserError when the file does not exist.
      def load_schema()
        schema_file = @spec_directory + "OpenLineage.json"

        unless File.exist?(schema_file)
          raise ParserError, "Couldn't find Openlineage.json file within a defined spec directory: " + schema_file
        end

        rewrite_schema_to_include_facets(File.read(schema_file))
      end

      # Current Openlineage schema contains references to facets' definitions stored in files
      # in facets directory which are not valid schemas for json_schema.
      # In this step we rewrite Openlineage schema to contain facets definitions within it
      #
      # +schema+ is the raw JSON text of OpenLineage.json; the return value is the
      # parsed, rewritten schema Hash.
      def rewrite_schema_to_include_facets(schema)
        # replace all the refs in schema to local refs
        # "facets/ColumnLineageDatasetFacet.json" -> "#/$defs/ColumnLineageDatasetFacet"
        schema_json = JSON.parse(schema.gsub(FACET_FILE_REF, '"#/$defs/\1"'))

        # Sorted so that the merge order (and therefore which definition wins on
        # a name clash) does not depend on the file system or Ruby version.
        Dir.glob(File.join(@spec_directory, "facets", "*.json")).sort.each do |facet_file|
          # Point the facet's references to the core schema at local definitions.
          facet_schema = JSON.parse(File.read(facet_file).gsub(CORE_SCHEMA_REF, '"#/$defs/\1"'))
          facet_defs = facet_schema["$defs"] || {}

          (facet_schema["properties"] || {}).each do |property, ref|
            facet_name = ref["$ref"]&.gsub(LOCAL_DEFS_PREFIX, "")
            # A property without a $ref names no facet definition to attach.
            next if facet_name.nil?

            parents = (facet_defs.dig(facet_name, "allOf") || []).filter_map do |definition|
              definition["$ref"]&.gsub(LOCAL_DEFS_PREFIX, "")
            end

            parents.each do |parent|
              add_ref_as_parent_property(schema_json, parent, facet_name, property)
            end
          end

          # include facets' definitions within schema
          schema_json["$defs"] = (schema_json["$defs"] || {}).merge(facet_defs)
        end

        schema_json
      end

      # Registers +property+ on the facets object that belongs to +parent+ as a
      # $ref to +facet_name+. Does nothing when validation for that parent kind
      # is disabled or the parent is unknown.
      def add_ref_as_parent_property(schema, parent, facet_name, property)
        getter = find_parent_object_getter(parent)
        return if getter.nil?

        target = getter.call(schema)
        properties = target["properties"] || {}
        properties[property] = {"$ref" => LOCAL_DEFS_PREFIX + facet_name}
        target["properties"] = properties
      end

      # Based on facet name find path to object facets
      #
      # Returns a lambda that takes the schema Hash and returns the facets object
      # for +parent+, or nil when +parent+ is unknown or its validate_* flag is off.
      def find_parent_object_getter(parent)
        case parent
        when "JobFacet"
          ->(schema) { schema["$defs"]["Job"]["properties"]["facets"] } if @validate_job_facets
        when "RunFacet"
          ->(schema) { schema["$defs"]["Run"]["properties"]["facets"] } if @validate_run_facets
        when "DatasetFacet"
          ->(schema) { schema["$defs"]["Dataset"]["properties"]["facets"] } if @validate_dataset_facets
        when "OutputDatasetFacet"
          typed_all_of_getter("OutputDataset", "outputFacets") if @validate_output_dataset_facets
        when "InputDatasetFacet"
          typed_all_of_getter("InputDataset", "inputFacets") if @validate_input_dataset_facets
        end
      end

      # Builds a getter for definitions whose own properties live in the first
      # allOf member that carries a "type" key (InputDataset / OutputDataset).
      def typed_all_of_getter(definition, facets_key)
        ->(schema) {
          typed_member = schema["$defs"][definition]["allOf"].select { |el| el.key?("type") }[0]
          typed_member["properties"][facets_key]
        }
      end
    end
  end
end
data/misc/fluent.conf ADDED
@@ -0,0 +1,101 @@
1
+ <source>
2
+ @type http
3
+ port 9880
4
+ <parse>
5
+ @type openlineage
6
+ </parse>
7
+ </source>
8
+
9
+
10
+ # https://docs.fluentd.org/output/http
11
+ <match api.v1.lineage> # the tag must match the path of the fluentd HTTP input endpoint: http://localhost:9880/api/v1/lineage
12
+ @type copy
13
+ <store>
14
+ @type http
15
+ endpoint_url "#{ENV['MARQUEZ_HTTP_ENDPOINT']}"
16
+ content_type application/json
17
+ bulk_request false # option provided by https://github.com/fluent-plugins-nursery/fluent-plugin-out-http
18
+ buffered true
19
+ serializer json
20
+ retryable_response_codes 408, 429, 500, 502, 503
21
+
22
+ <buffer>
23
+ @type file
24
+ path /tmp/openlineage/buf/chunk-*
25
+ flush_mode immediate
26
+ </buffer>
27
+ </store>
28
+
29
+ <store>
30
+ @type stdout # for testing purposes: demonstrates that the copy output is working
31
+ </store>
32
+
33
+ # additional output <store> sections can be added here
34
+ </match>
35
+
36
+
37
+ # source for prometheus metrics
38
+ <source>
39
+ @type forward
40
+ bind 0.0.0.0
41
+ port 24224
42
+ </source>
43
+
44
+ # count the number of incoming records per tag
45
+ <filter company.*>
46
+ @type prometheus
47
+ <metric>
48
+ name fluentd_input_status_num_records_total
49
+ type counter
50
+ desc The total number of incoming records
51
+ <labels>
52
+ tag ${tag}
53
+ hostname ${hostname}
54
+ </labels>
55
+ </metric>
56
+ </filter>
57
+
58
+ # count the number of outgoing records per tag
59
+ <match company.*>
60
+ @type copy
61
+
62
+ <store>
63
+ @type forward
64
+ <server>
65
+ name myserver1
66
+ host 192.168.1.3
67
+ port 24224
68
+ weight 60
69
+ </server>
70
+ </store>
71
+
72
+ <store>
73
+ @type prometheus
74
+ <metric>
75
+ name fluentd_output_status_num_records_total
76
+ type counter
77
+ desc The total number of outgoing records
78
+ <labels>
79
+ tag ${tag}
80
+ hostname ${hostname}
81
+ </labels>
82
+ </metric>
83
+ </store>
84
+
85
+ </match>
86
+
87
+ # expose metrics in prometheus format
88
+ <source>
89
+ @type prometheus
90
+ bind 0.0.0.0
91
+ port 24231
92
+ metrics_path /metrics
93
+ </source>
94
+
95
+ <source>
96
+ @type prometheus_output_monitor
97
+ interval 10
98
+ <labels>
99
+ hostname ${hostname}
100
+ </labels>
101
+ </source>
@@ -0,0 +1,73 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2019-05-09T19:50:24.201361Z",
4
+ "run": {
5
+ "runId": "d46e465b-d358-4d32-83d4-df660ff614dd"
6
+ },
7
+ "job": {
8
+ "namespace": "my-namespace",
9
+ "name": "my-job"
10
+ },
11
+ "inputs": [
12
+ {
13
+ "namespace": "my-namespace",
14
+ "name": "my-input",
15
+ "facets": {
16
+ "schema": {
17
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
18
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
19
+ "fields": [
20
+ {
21
+ "name": "a",
22
+ "type": "INTEGER"
23
+ },
24
+ {
25
+ "name": "b",
26
+ "type": "TIMESTAMP"
27
+ },
28
+ {
29
+ "name": "c",
30
+ "type": "INTEGER"
31
+ },
32
+ {
33
+ "name": "d",
34
+ "type": "INTEGER"
35
+ }
36
+ ]
37
+ }
38
+ }
39
+ }
40
+ ],
41
+ "outputs": [
42
+ {
43
+ "namespace": "my-namespace",
44
+ "name": "my-output",
45
+ "facets": {
46
+ "schema": {
47
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
48
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
49
+ "fields": [
50
+ {
51
+ "name": "a",
52
+ "type": "INTEGER"
53
+ },
54
+ {
55
+ "name": "b",
56
+ "type": "TIMESTAMP"
57
+ },
58
+ {
59
+ "name": "c",
60
+ "type": "INTEGER"
61
+ },
62
+ {
63
+ "name": "d",
64
+ "type": "INTEGER"
65
+ }
66
+ ]
67
+ }
68
+ }
69
+ }
70
+ ],
71
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
72
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
73
+ }
@@ -0,0 +1,73 @@
1
+ {
2
+ "eventType": "START",
3
+ "eventTime": "2019-05-09T19:49:24.201361Z",
4
+ "run": {
5
+ "runId": "d46e465b-d358-4d32-83d4-df660ff614dd"
6
+ },
7
+ "job": {
8
+ "namespace": "my-namespace",
9
+ "name": "my-job"
10
+ },
11
+ "inputs": [
12
+ {
13
+ "namespace": "my-namespace",
14
+ "name": "my-input",
15
+ "facets": {
16
+ "schema": {
17
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
18
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
19
+ "fields": [
20
+ {
21
+ "name": "a",
22
+ "type": "INTEGER"
23
+ },
24
+ {
25
+ "name": "b",
26
+ "type": "TIMESTAMP"
27
+ },
28
+ {
29
+ "name": "c",
30
+ "type": "INTEGER"
31
+ },
32
+ {
33
+ "name": "d",
34
+ "type": "INTEGER"
35
+ }
36
+ ]
37
+ }
38
+ }
39
+ }
40
+ ],
41
+ "outputs": [
42
+ {
43
+ "namespace": "my-namespace",
44
+ "name": "my-output",
45
+ "facets": {
46
+ "schema": {
47
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
48
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
49
+ "fields": [
50
+ {
51
+ "name": "a",
52
+ "type": "INTEGER"
53
+ },
54
+ {
55
+ "name": "b",
56
+ "type": "TIMESTAMP"
57
+ },
58
+ {
59
+ "name": "c",
60
+ "type": "INTEGER"
61
+ },
62
+ {
63
+ "name": "d",
64
+ "type": "INTEGER"
65
+ }
66
+ ]
67
+ }
68
+ }
69
+ }
70
+ ],
71
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
72
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
73
+ }