fluentd-openlineage-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. checksums.yaml +7 -0
  2. data/.idea/.gitignore +8 -0
  3. data/.idea/fluentd.iml +204 -0
  4. data/.idea/misc.xml +4 -0
  5. data/.idea/modules/benchmark-memory-0.2.iml +12 -0
  6. data/.idea/modules/bigdecimal-3.1.iml +11 -0
  7. data/.idea/modules/certstore_c-0.1.iml +15 -0
  8. data/.idea/modules/concurrent-ruby-1.3.iml +18 -0
  9. data/.idea/modules/concurrent-ruby-1.31.iml +15 -0
  10. data/.idea/modules/connection_pool-2.4.iml +11 -0
  11. data/.idea/modules/cool.io-1.8.iml +16 -0
  12. data/.idea/modules/drb-2.2.iml +14 -0
  13. data/.idea/modules/drb-2.21.iml +11 -0
  14. data/.idea/modules/ffi-1.17.iml +20 -0
  15. data/.idea/modules/ffi-win32-extensions-1.0.iml +19 -0
  16. data/.idea/modules/fluentd-1.17.iml +43 -0
  17. data/.idea/modules/http_parser.rb-0.8.iml +17 -0
  18. data/.idea/modules/json-2.7.iml +14 -0
  19. data/.idea/modules/json-2.71.iml +11 -0
  20. data/.idea/modules/msgpack-1.7.iml +15 -0
  21. data/.idea/modules/mutex_m-0.2.iml +15 -0
  22. data/.idea/modules/new_gem.iml +15 -0
  23. data/.idea/modules/power_assert-2.0.iml +19 -0
  24. data/.idea/modules/rake-13.2.iml +18 -0
  25. data/.idea/modules/rake-13.21.iml +15 -0
  26. data/.idea/modules/rake-compiler-1.2.iml +13 -0
  27. data/.idea/modules/rusty_json_schema-0.15.iml +15 -0
  28. data/.idea/modules/serverengine-2.3.iml +17 -0
  29. data/.idea/modules/sigdump-0.2.iml +16 -0
  30. data/.idea/modules/specifications.iml +14 -0
  31. data/.idea/modules/specifications1.iml +11 -0
  32. data/.idea/modules/strptime-0.2.iml +16 -0
  33. data/.idea/modules/thermite-0.13.iml +17 -0
  34. data/.idea/modules/webrick-1.8.iml +18 -0
  35. data/.idea/modules/win32-event-0.6.iml +21 -0
  36. data/.idea/modules/win32-ipc-0.7.iml +20 -0
  37. data/.idea/modules/yajl-ruby-1.4.iml +779 -0
  38. data/.idea/modules.xml +41 -0
  39. data/Gemfile +3 -0
  40. data/README.md +223 -0
  41. data/Rakefile +13 -0
  42. data/config/conf/fluent.conf +101 -0
  43. data/config/test-complete.json +73 -0
  44. data/config/test-start.json +73 -0
  45. data/events/event_full.json +206 -0
  46. data/events/event_invalid_dataset_facet.json +31 -0
  47. data/events/event_invalid_input_dataset_facet.json +29 -0
  48. data/events/event_invalid_job_facet.json +26 -0
  49. data/events/event_invalid_output_dataset_facet.json +29 -0
  50. data/events/event_invalid_run_facet.json +28 -0
  51. data/events/event_no_run_id.json +28 -0
  52. data/events/event_simple.json +29 -0
  53. data/fluentd-openlineage-parser.gemspec +28 -0
  54. data/lib/fluent/plugin/fluentd-openlineage-parser.rb +182 -0
  55. data/spec/Naming.md +500 -0
  56. data/spec/OpenLineage.json +304 -0
  57. data/spec/OpenLineage.md +179 -0
  58. data/spec/OpenLineage.yml +27 -0
  59. data/spec/OpenLineageModel.svg +1 -0
  60. data/spec/Versioning.md +49 -0
  61. data/spec/facets/ColumnLineageDatasetFacet.json +96 -0
  62. data/spec/facets/ColumnLineageDatasetFacet.md +106 -0
  63. data/spec/facets/DataQualityAssertionsDatasetFacet.json +49 -0
  64. data/spec/facets/DataQualityMetricsInputDatasetFacet.json +76 -0
  65. data/spec/facets/DatasetVersionDatasetFacet.json +31 -0
  66. data/spec/facets/DatasourceDatasetFacet.json +32 -0
  67. data/spec/facets/DocumentationDatasetFacet.json +31 -0
  68. data/spec/facets/DocumentationJobFacet.json +30 -0
  69. data/spec/facets/ErrorMessageRunFacet.json +41 -0
  70. data/spec/facets/ExternalQueryRunFacet.json +36 -0
  71. data/spec/facets/ExternalQueryRunFacet.md +49 -0
  72. data/spec/facets/ExtractionErrorRunFacet.json +58 -0
  73. data/spec/facets/JobTypeJobFacet.json +41 -0
  74. data/spec/facets/LifecycleStateChangeDatasetFacet.json +46 -0
  75. data/spec/facets/NominalTimeRunFacet.json +38 -0
  76. data/spec/facets/OutputStatisticsOutputDatasetFacet.json +36 -0
  77. data/spec/facets/OwnershipDatasetFacet.json +45 -0
  78. data/spec/facets/OwnershipJobFacet.json +45 -0
  79. data/spec/facets/ParentRunFacet.json +54 -0
  80. data/spec/facets/ProcessingEngineRunFacet.json +41 -0
  81. data/spec/facets/SQLJobFacet.json +30 -0
  82. data/spec/facets/SchemaDatasetFacet.json +59 -0
  83. data/spec/facets/SourceCodeJobFacet.json +34 -0
  84. data/spec/facets/SourceCodeLocationJobFacet.json +60 -0
  85. data/spec/facets/StorageDatasetFacet.json +35 -0
  86. data/spec/facets/SymlinksDatasetFacet.json +47 -0
  87. data/spec/registry/core/registry.json +31 -0
  88. data/spec/registry/gcp/facets/GcpCommonJobFacet.json +43 -0
  89. data/spec/registry/gcp/registry.json +6 -0
  90. data/spec/release.sh +80 -0
  91. data/spec/tests/ColumnLineageDatasetFacet/1.json +172 -0
  92. data/spec/tests/DataQualityAssertionsDatasetFacet/1.json +58 -0
  93. data/spec/tests/DataQualityMetricsInputDatasetFacet/1.json +23 -0
  94. data/spec/tests/DatasetVersionDatasetFacet/1.json +7 -0
  95. data/spec/tests/DatasourceDatasetFacet/1.json +7 -0
  96. data/spec/tests/DocumentationDatasetFacet/1.json +7 -0
  97. data/spec/tests/DocumentationJobFacet/1.json +7 -0
  98. data/spec/tests/ErrorMessageRunFacet/1.json +9 -0
  99. data/spec/tests/ExternalQueryRunFacet/1.json +8 -0
  100. data/spec/tests/ExtractionErrorRunFacet/1.json +15 -0
  101. data/spec/tests/JobTypeJobFacet/1.json +9 -0
  102. data/spec/tests/LifecycleStateChangeDatasetFacet/1.json +11 -0
  103. data/spec/tests/NominalTimeRunFacet/1.json +8 -0
  104. data/spec/tests/OutputStatisticsOutputDatasetFacet/1.json +9 -0
  105. data/spec/tests/OwnershipDatasetFacet/1.json +11 -0
  106. data/spec/tests/OwnershipJobFacet/1.json +11 -0
  107. data/spec/tests/ParentRunFacet/1.json +13 -0
  108. data/spec/tests/ProcessingEngineRunFacet/1.json +9 -0
  109. data/spec/tests/SQLJobFacet/1.json +7 -0
  110. data/spec/tests/SchemaDatasetFacet/1.json +92 -0
  111. data/spec/tests/SourceCodeJobFacet/1.json +8 -0
  112. data/spec/tests/SourceCodeLocationJobFacet/1.json +8 -0
  113. data/spec/tests/StorageDatasetFacet/1.json +8 -0
  114. data/spec/tests/SymlinksDatasetFacet/1.json +13 -0
  115. data/spec/tests/example_full_event.json +24 -0
  116. data/test/helper.rb +8 -0
  117. data/test/plugin/test_parser_openlineage.rb +141 -0
  118. metadata +298 -0
@@ -0,0 +1,206 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2020-12-28T19:51:01.641Z",
4
+ "run": {
5
+ "runId": "ea041791-68bc-4ae1-bd89-4c8106a157e4",
6
+ "facets": {
7
+ "nominalTime": {
8
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
9
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
10
+ "nominalStartTime": "2020-12-17T03:00:00.001Z",
11
+ "nominalEndTime": "2020-12-17T04:00:00.001Z"
12
+ },
13
+ "parent": {
14
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
15
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
16
+ "run": {
17
+ "runId": "3f5e83fa-3480-44ff-99c5-ff943904e5e8"
18
+ },
19
+ "job": {
20
+ "namespace": "my-scheduler-namespace",
21
+ "name": "myjob.mytask"
22
+ }
23
+ },
24
+ "additionalProp1": {
25
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
26
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
27
+ "additionalProp1": {}
28
+ },
29
+ "additionalProp2": {
30
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
31
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
32
+ "additionalProp1": {}
33
+ },
34
+ "additionalProp3": {
35
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
36
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
37
+ "additionalProp1": {}
38
+ }
39
+ }
40
+ },
41
+ "job": {
42
+ "namespace": "my-scheduler-namespace",
43
+ "name": "myjob.mytask",
44
+ "facets": {
45
+ "documentation": {
46
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
47
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
48
+ "description": "string"
49
+ },
50
+ "sourceCodeLocation": {
51
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
52
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
53
+ "type": "git",
54
+ "url": "http://example.com"
55
+ },
56
+ "sql": {
57
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
58
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
59
+ "additionalPropExample": {
60
+ "example": true
61
+ },
62
+ "query": "SELECT * FROM foo"
63
+ },
64
+ "additionalProp1": {
65
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
66
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
67
+ "additionalProp1": {}
68
+ },
69
+ "additionalProp2": {
70
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
71
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
72
+ "additionalProp1": {}
73
+ },
74
+ "additionalProp3": {
75
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
76
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
77
+ "additionalProp1": {}
78
+ }
79
+ }
80
+ },
81
+ "inputs": [
82
+ {
83
+ "namespace": "my-datasource-namespace",
84
+ "name": "instance.schema.table",
85
+ "inputFacets": {
86
+ "dataQualityMetrics": {
87
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
88
+ "_schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/DataQualityMetricsInputDatasetFacet",
89
+ "rowCount": 1000,
90
+ "bytes": 1048576,
91
+ "fileCount": 5
92
+ },
93
+ "dataQualityAssertions": {
94
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
95
+ "_schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/DataQualityAssertionsDatasetFacet",
96
+ "assertions": [
97
+ {
98
+ "assertion": "row_count_equal_to",
99
+ "success": true
100
+ },
101
+ {
102
+ "assertion": "no_null_values",
103
+ "success": true,
104
+ "column": "id"
105
+ }
106
+ ]
107
+ }
108
+ },
109
+ "facets": {
110
+ "documentation": {
111
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
112
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
113
+ "description": "canonical representation of entity Foo"
114
+ },
115
+ "schema": {
116
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
117
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
118
+ "fields": [
119
+ {
120
+ "name": "column1",
121
+ "type": "VARCHAR",
122
+ "description": "string"
123
+ }
124
+ ]
125
+ },
126
+ "dataSource": {
127
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
128
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
129
+ "name": "string",
130
+ "uri": "string"
131
+ },
132
+ "additionalProp1": {
133
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
134
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
135
+ "additionalProp1": {}
136
+ },
137
+ "additionalProp2": {
138
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
139
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
140
+ "additionalProp1": {}
141
+ },
142
+ "additionalProp3": {
143
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
144
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
145
+ "additionalProp1": {}
146
+ }
147
+ }
148
+ }
149
+ ],
150
+ "outputs": [
151
+ {
152
+ "namespace": "my-datasource-namespace",
153
+ "name": "instance.schema.table",
154
+ "outputFacets": {
155
+ "outputStatistics": {
156
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
157
+ "_schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/OutputStatisticsOutputDatasetFacet",
158
+ "rowCount": 2000,
159
+ "size": 2097152,
160
+ "fileCount": 5
161
+ }
162
+ },
163
+ "facets": {
164
+ "documentation": {
165
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
166
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
167
+ "description": "canonical representation of entity Foo"
168
+ },
169
+ "schema": {
170
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
171
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
172
+ "fields": [
173
+ {
174
+ "name": "column1",
175
+ "type": "VARCHAR",
176
+ "description": "string"
177
+ }
178
+ ]
179
+ },
180
+ "dataSource": {
181
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
182
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
183
+ "name": "string",
184
+ "uri": "string"
185
+ },
186
+ "additionalProp1": {
187
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
188
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
189
+ "additionalProp1": {}
190
+ },
191
+ "additionalProp2": {
192
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
193
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
194
+ "additionalProp1": {}
195
+ },
196
+ "additionalProp3": {
197
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
198
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#MyCustomJobFacet",
199
+ "additionalProp1": {}
200
+ }
201
+ }
202
+ }
203
+ ],
204
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
205
+ "schemaURL": "https://openlineage.io/spec/1-0-5/OpenLineage.json#/definitions/RunEvent"
206
+ }
@@ -0,0 +1,31 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2020-12-28T19:52:00.001+10:00",
4
+ "run": {
5
+ "runId": "41fb5137-f0fd-4ee5-ba5c-56f8571d1bd7"
6
+ },
7
+ "job": {
8
+ "namespace": "my-scheduler-namespace",
9
+ "name": "myjob"
10
+ },
11
+ "inputs": [ ],
12
+ "outputs": [
13
+ {
14
+ "namespace": "my-datasource-namespace",
15
+ "name": "instance.schema.output-1",
16
+ "facets": {
17
+ "ownership": {
18
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
19
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#whatever",
20
+ "owners": [
21
+ {
22
+ "no-name": "no-name-owner"
23
+ }
24
+ ]
25
+ }
26
+ }
27
+ }
28
+ ],
29
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
30
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
31
+ }
@@ -0,0 +1,29 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2020-12-28T19:52:00.001+10:00",
4
+ "run": {
5
+ "runId": "41fb5137-f0fd-4ee5-ba5c-56f8571d1bd7"
6
+ },
7
+ "job": {
8
+ "namespace": "my-scheduler-namespace",
9
+ "name": "myjob"
10
+ },
11
+ "inputs": [
12
+ {
13
+ "namespace": "my-datasource-namespace",
14
+ "name": "instance.schema.input-1",
15
+ "inputFacets": {
16
+ "dataQualityMetrics": {
17
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
18
+ "_schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/DataQualityMetricsInputDatasetFacet",
19
+ "noRowCount": 1000,
20
+ "bytes": 1048576,
21
+ "fileCount": 5
22
+ }
23
+ }
24
+ }
25
+ ],
26
+ "outputs": [],
27
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
28
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
29
+ }
@@ -0,0 +1,26 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2020-12-28T19:52:00.001+10:00",
4
+ "run": {
5
+ "runId": "41fb5137-f0fd-4ee5-ba5c-56f8571d1bd7"
6
+ },
7
+ "job": {
8
+ "namespace": "my-scheduler-namespace",
9
+ "name": "myjob",
10
+ "facets": {
11
+ "ownership": {
12
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
13
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.yml#whatever",
14
+ "owners": [
15
+ {
16
+ "no-name": "no-name-owner"
17
+ }
18
+ ]
19
+ }
20
+ }
21
+ },
22
+ "inputs": [ ],
23
+ "outputs": [ ],
24
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
25
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
26
+ }
@@ -0,0 +1,29 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2020-12-28T19:52:00.001+10:00",
4
+ "run": {
5
+ "runId": "41fb5137-f0fd-4ee5-ba5c-56f8571d1bd7"
6
+ },
7
+ "job": {
8
+ "namespace": "my-scheduler-namespace",
9
+ "name": "myjob"
10
+ },
11
+ "inputs": [],
12
+ "outputs": [
13
+ {
14
+ "namespace": "my-datasource-namespace",
15
+ "name": "instance.schema.output-1",
16
+ "outputFacets": {
17
+ "outputStatistics": {
18
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
19
+ "_schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/OutputStatisticsOutputDatasetFacet",
20
+ "rowCount": "wrong",
21
+ "size": 2097152,
22
+ "fileCount": 5
23
+ }
24
+ }
25
+ }
26
+ ],
27
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
28
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
29
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2020-12-28T19:52:00.001+10:00",
4
+ "run": {
5
+ "runId": "41fb5137-f0fd-4ee5-ba5c-56f8571d1bd7",
6
+ "facets": {
7
+ "parent": {
8
+ "run": {
9
+ "noRunId": "invalid run id"
10
+ },
11
+ "job": {
12
+ "namespace": "parent_namespace",
13
+ "name": "parent_name"
14
+ },
15
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
16
+ "_schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunFacet"
17
+ }
18
+ }
19
+ },
20
+ "job": {
21
+ "namespace": "my-scheduler-namespace",
22
+ "name": "myjob"
23
+ },
24
+ "inputs": [ ],
25
+ "outputs": [ ],
26
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
27
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2020-12-28T19:52:00.001+10:00",
4
+ "run": {
5
+ },
6
+ "job": {
7
+ "namespace": "my-scheduler-namespace",
8
+ "name": "myjob"
9
+ },
10
+ "inputs": [
11
+ {
12
+ "namespace": "my-datasource-namespace",
13
+ "name": "instance.schema.input-1"
14
+ },
15
+ {
16
+ "namespace": "my-datasource-namespace",
17
+ "name": "instance.schema.input-2"
18
+ }
19
+ ],
20
+ "outputs": [
21
+ {
22
+ "namespace": "my-datasource-namespace",
23
+ "name": "instance.schema.output-1"
24
+ }
25
+ ],
26
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
27
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
28
+ }
@@ -0,0 +1,29 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2020-12-28T19:52:00.001+10:00",
4
+ "run": {
5
+ "runId": "41fb5137-f0fd-4ee5-ba5c-56f8571d1bd7"
6
+ },
7
+ "job": {
8
+ "namespace": "my-scheduler-namespace",
9
+ "name": "myjob"
10
+ },
11
+ "inputs": [
12
+ {
13
+ "namespace": "my-datasource-namespace",
14
+ "name": "instance.schema.input-1"
15
+ },
16
+ {
17
+ "namespace": "my-datasource-namespace",
18
+ "name": "instance.schema.input-2"
19
+ }
20
+ ],
21
+ "outputs": [
22
+ {
23
+ "namespace": "my-datasource-namespace",
24
+ "name": "instance.schema.output-1"
25
+ }
26
+ ],
27
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
28
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
29
+ }
@@ -0,0 +1,28 @@
1
# Gem specification for the fluentd-openlineage-parser plugin.

# Make the gem's own lib/ directory loadable while the spec is evaluated.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

Gem::Specification.new do |spec|
  spec.name          = "fluentd-openlineage-parser"
  spec.version       = "0.1.0"
  spec.authors       = ["Pawel Leszczynski"]
  spec.email         = ["leszczynski.pawel@gmail.com"]

  spec.summary       = "Parser to validate Openlineage events."
  spec.description   = "Fluentd parser that validates if JSON is a valid Openlineage event."
  spec.homepage      = "http://openlineage.io"
  spec.license       = "Apache-2.0"

  # Split the git-tracked files into test-like files (test/, spec/, features/)
  # and everything else; only the latter is shipped as regular gem files.
  tracked_files = `git ls-files -z`.split("\x0")
  test_files, files = tracked_files.partition { |path| path.match(%r{^(test|spec|features)/}) }

  spec.files         = files
  spec.executables   = files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = test_files
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "test-unit"
  spec.add_runtime_dependency "fluentd", [">= 0.14.10", "< 2"]
  spec.add_dependency "rusty_json_schema"
end
@@ -0,0 +1,182 @@
1
+ require "fluent/plugin/parser"
2
+ require 'fluent/plugin/parser_json'
3
+ require 'json'
4
+ require "rusty_json_schema"
5
+
6
+ module Fluent
7
+ module Plugin
8
+ class OpenlineageParser < Fluent::Plugin::JSONParser
9
+ Fluent::Plugin.register_parser("openlineage", self)
10
+
11
+ DEFAULT_SPEC_DIRECTORY="/etc/spec"
12
+
13
# Configures the parser from its configuration section.
#
# Reads the directory that holds the OpenLineage spec ('spec_directory',
# falling back to DEFAULT_SPEC_DIRECTORY, always stored with a trailing "/")
# and the five per-facet-kind validation switches, then loads the schema,
# builds the validator and finally delegates to the parent's configure.
#
# Switch values may arrive as strings, in which case a literal "false" would
# be truthy in Ruby. They are therefore converted explicitly: true/"true"/
# "yes"/"" enable a switch, false/"false"/"no" disable it, and a missing or
# nil value keeps the default.
#
# @param conf hash-like configuration (must respond to has_key?, [] and fetch)
# @raise [Fluent::ConfigError] when a switch holds a non-boolean value
def configure(conf)
  directory = conf.has_key?('spec_directory') ? conf['spec_directory'] : DEFAULT_SPEC_DIRECTORY
  directory = "#{directory}/" unless directory.end_with?("/")
  @spec_directory = directory

  flag = lambda do |key, default|
    raw = conf.fetch(key, default)
    next raw if raw == true || raw == false
    next default if raw.nil?

    case raw.to_s.strip.downcase
    when "true", "yes", "" then true
    when "false", "no" then false
    else
      raise Fluent::ConfigError, "#{key}: expected a boolean (true/false/yes/no) but got #{raw.inspect}"
    end
  end

  @validate_input_dataset_facets = flag.call('validate_input_dataset_facets', false)
  @validate_output_dataset_facets = flag.call('validate_output_dataset_facets', false)
  @validate_dataset_facets = flag.call('validate_dataset_facets', false)
  @validate_run_facets = flag.call('validate_run_facets', true)
  @validate_job_facets = flag.call('validate_job_facets', true)

  # The switches must be set before the schema is loaded: the schema rewrite
  # consults them to decide which facet kinds get strict definitions.
  @schema = load_schema()
  @validator = RustyJSONSchema.build(@schema)
  super
end
31
+
32
# Parses +text+ with the inherited JSON parser, validates every resulting
# record against the OpenLineage schema and only then hands the
# (time, record) pair to the caller's block.
# See https://docs.fluentd.org/plugin-development/api-plugin-parser
def parse(text)
  super(text) do |event_time, record|
    validate_openlineage(record)
    yield event_time, record
  end
end
40
+
41
+ private
42
+
43
+
44
# Validates a parsed record against the OpenLineage schema.
#
# Uses the validator built in configure (rusty_json_schema,
# https://github.com/driv3r/rusty_json_schema). When the validator reports the
# generic "is not valid under any of the given schemas" message, the errors are
# replaced by the more detailed per-candidate errors from enrich_oneOf_errors.
# If that enrichment yields nothing, the validator's original errors are kept so
# that a failed validation can never be silently turned into a success.
#
# @param json parsed record, or nil when the JSON could not be parsed
# @return [nil] when the record is valid
# @raise [ParserError] when +json+ is nil or does not satisfy the schema
def validate_openlineage(json)
  raise ParserError, "Openlineage validation failed: invalid json provided" if json.nil?

  errors = @validator.validate(json)

  generic_one_of = errors.any? do |message|
    message.to_s.include?("is not valid under any of the given schemas")
  end
  if generic_one_of
    detailed = enrich_oneOf_errors(json)
    errors = detailed unless detailed.empty?
  end

  return if errors.empty?

  raise ParserError, "Openlineage validation failed: " + errors.join(", ")
end
60
+
61
# The validator returns a very generic OneOfNotValid error message.
# To find a better reason for the mismatch, the record is validated against
# each "oneOf" candidate of the schema separately.
#
# For every candidate a fresh schema is built (the loaded schema without
# "oneOf", plus a top-level "$ref" to the candidate). @schema itself is never
# modified, so this method can be called for any number of records.
#
# @param json parsed record that failed validation
# @return [Array<String>] one "<candidate>: <errors>" entry per candidate that
#   rejected the record; empty when the schema has no "oneOf" or no candidate
#   reported errors
def enrich_oneOf_errors(json)
  base_schema = @schema.reject { |key, _| key == "oneOf" }

  (@schema["oneOf"] || []).each_with_object([]) do |ref, errors|
    candidate_schema = base_schema.merge("$ref" => ref["$ref"])
    failures = RustyJSONSchema.build(candidate_schema).validate(json)
    errors << "#{ref}: #{failures.join(", ")}" unless failures.empty?
  end
end
77
+
78
# Loads the OpenLineage schema from the configured spec directory.
#
# Reads "OpenLineage.json" located directly under @spec_directory and passes
# its raw text through rewrite_schema_to_include_facets.
#
# @return the result of rewrite_schema_to_include_facets for the file contents
# @raise [ParserError] when the schema file does not exist
def load_schema()
  schema_path = @spec_directory + "OpenLineage.json"

  unless File.exist?(schema_path)
    raise ParserError, "Couldn't find Openlineage.json file within a defined spec directory: " + schema_path
  end

  rewrite_schema_to_include_facets(File.read(schema_path))
end
89
+
90
# The current OpenLineage schema references facet definitions stored in files
# in the facets directory, which are not valid schemas for the validator.
# This step rewrites the OpenLineage schema so that it contains the facet
# definitions itself.
#
# Steps:
# 1. File references in the main schema become local references, e.g.
#    "facets/ColumnLineageDatasetFacet.json" -> "#/$defs/ColumnLineageDatasetFacet".
# 2. For every *.json file under "<spec directory>/facets" (in sorted order),
#    absolute references to OpenLineage.json definitions of any spec version
#    are made local as well.
# 3. Each facet property that points to a definition in the same file is
#    registered, via add_ref_as_parent_property, under every parent named in
#    that definition's "allOf" list.
# 4. The facet file's "$defs" are merged into the main schema's "$defs".
#
# Properties without a "$ref", references to definitions missing from the
# facet file, and facet files without "properties" or "$defs" are skipped
# instead of aborting the whole schema load.
#
# @param schema [String] raw JSON text of OpenLineage.json
# @return [Hash] parsed schema including all facet definitions
# @raise [JSON::ParserError] when the schema or a facet file is not valid JSON
def rewrite_schema_to_include_facets(schema)
  local_ref = '"#/$defs/\1"'
  main_schema_ref = %r{"facets/([a-zA-Z]+)\.json"}
  # \d+ so that multi-digit spec versions such as 2-0-12 are recognised too
  absolute_ref = %r{"https://openlineage\.io/spec/\d+-\d+-\d+/OpenLineage\.json#/\$defs/([a-zA-Z]+)"}

  schema_json = JSON.parse(schema.gsub(main_schema_ref, local_ref))
  schema_json["$defs"] ||= {}

  facet_files = Dir.glob(File.join(@spec_directory, "facets", "*.json")).sort
  facet_files.each do |facet_file|
    facet_schema = JSON.parse(File.read(facet_file).gsub(absolute_ref, local_ref))
    facet_defs = facet_schema["$defs"] || {}

    (facet_schema["properties"] || {}).each do |property, ref|
      next unless ref.is_a?(Hash) && ref["$ref"]

      facet_name = ref["$ref"].gsub("#/$defs/", "")
      definition = facet_defs[facet_name]
      next unless definition.is_a?(Hash)

      (definition["allOf"] || []).each do |parent_definition|
        parent_ref = parent_definition["$ref"]
        next if parent_ref.nil?

        add_ref_as_parent_property(schema_json, parent_ref.gsub("#/$defs/", ""), facet_name, property)
      end
    end

    # include the facet's definitions within the schema
    schema_json["$defs"] = schema_json["$defs"].merge(facet_defs)
  end

  schema_json
end
130
+
131
# Registers a facet definition as a named property of its parent facets
# object inside +schema+.
#
# The parent object is located through the getter returned by
# find_parent_object_getter; when no getter is returned for +parent+,
# +schema+ is left untouched.
#
# @param schema [Hash] schema being rewritten (modified in place)
# @param parent [String] name of the parent facet type
# @param facet_name [String] name of the facet definition under "$defs"
# @param property [String] property name under which the facet is registered
# @return [Hash, nil] the updated "properties" hash, or nil when nothing was done
def add_ref_as_parent_property(schema, parent, facet_name, property)
  locate = find_parent_object_getter(parent)
  return if locate.nil?

  container = locate.call(schema)
  entries = container["properties"] || {}
  entries[property] = { "$ref" => "#/$defs/" + facet_name }
  container["properties"] = entries
end
140
+
141
# Based on the parent facet type name, returns a lambda that, given the
# schema, returns the schema node describing the matching facets object.
#
# A lambda is returned only when the validation switch for that facet kind
# (@validate_job_facets, @validate_run_facets, @validate_dataset_facets,
# @validate_output_dataset_facets, @validate_input_dataset_facets) is truthy;
# otherwise, and for unknown parent names, nil is returned.
#
# For input/output datasets the node is looked up in the first "allOf"
# element that has a "type" key.
def find_parent_object_getter(parent)
  typed_member = ->(schema, definition) { schema["$defs"][definition]["allOf"].find { |el| el.key?("type") } }

  enabled, locate =
    case parent
    when "JobFacet"
      [@validate_job_facets, ->(schema) { schema["$defs"]["Job"]["properties"]["facets"] }]
    when "RunFacet"
      [@validate_run_facets, ->(schema) { schema["$defs"]["Run"]["properties"]["facets"] }]
    when "DatasetFacet"
      [@validate_dataset_facets, ->(schema) { schema["$defs"]["Dataset"]["properties"]["facets"] }]
    when "OutputDatasetFacet"
      [@validate_output_dataset_facets,
       ->(schema) { typed_member.call(schema, "OutputDataset")["properties"]["outputFacets"] }]
    when "InputDatasetFacet"
      [@validate_input_dataset_facets,
       ->(schema) { typed_member.call(schema, "InputDataset")["properties"]["inputFacets"] }]
    end

  enabled ? locate : nil
end
180
+ end
181
+ end
182
+ end