fluentd-openlineage-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. checksums.yaml +7 -0
  2. data/.idea/.gitignore +8 -0
  3. data/.idea/fluentd.iml +204 -0
  4. data/.idea/misc.xml +4 -0
  5. data/.idea/modules/benchmark-memory-0.2.iml +12 -0
  6. data/.idea/modules/bigdecimal-3.1.iml +11 -0
  7. data/.idea/modules/certstore_c-0.1.iml +15 -0
  8. data/.idea/modules/concurrent-ruby-1.3.iml +18 -0
  9. data/.idea/modules/concurrent-ruby-1.31.iml +15 -0
  10. data/.idea/modules/connection_pool-2.4.iml +11 -0
  11. data/.idea/modules/cool.io-1.8.iml +16 -0
  12. data/.idea/modules/drb-2.2.iml +14 -0
  13. data/.idea/modules/drb-2.21.iml +11 -0
  14. data/.idea/modules/ffi-1.17.iml +20 -0
  15. data/.idea/modules/ffi-win32-extensions-1.0.iml +19 -0
  16. data/.idea/modules/fluentd-1.17.iml +43 -0
  17. data/.idea/modules/http_parser.rb-0.8.iml +17 -0
  18. data/.idea/modules/json-2.7.iml +14 -0
  19. data/.idea/modules/json-2.71.iml +11 -0
  20. data/.idea/modules/msgpack-1.7.iml +15 -0
  21. data/.idea/modules/mutex_m-0.2.iml +15 -0
  22. data/.idea/modules/new_gem.iml +15 -0
  23. data/.idea/modules/power_assert-2.0.iml +19 -0
  24. data/.idea/modules/rake-13.2.iml +18 -0
  25. data/.idea/modules/rake-13.21.iml +15 -0
  26. data/.idea/modules/rake-compiler-1.2.iml +13 -0
  27. data/.idea/modules/rusty_json_schema-0.15.iml +15 -0
  28. data/.idea/modules/serverengine-2.3.iml +17 -0
  29. data/.idea/modules/sigdump-0.2.iml +16 -0
  30. data/.idea/modules/specifications.iml +14 -0
  31. data/.idea/modules/specifications1.iml +11 -0
  32. data/.idea/modules/strptime-0.2.iml +16 -0
  33. data/.idea/modules/thermite-0.13.iml +17 -0
  34. data/.idea/modules/webrick-1.8.iml +18 -0
  35. data/.idea/modules/win32-event-0.6.iml +21 -0
  36. data/.idea/modules/win32-ipc-0.7.iml +20 -0
  37. data/.idea/modules/yajl-ruby-1.4.iml +779 -0
  38. data/.idea/modules.xml +41 -0
  39. data/Gemfile +3 -0
  40. data/README.md +223 -0
  41. data/Rakefile +13 -0
  42. data/config/conf/fluent.conf +101 -0
  43. data/config/test-complete.json +73 -0
  44. data/config/test-start.json +73 -0
  45. data/events/event_full.json +206 -0
  46. data/events/event_invalid_dataset_facet.json +31 -0
  47. data/events/event_invalid_input_dataset_facet.json +29 -0
  48. data/events/event_invalid_job_facet.json +26 -0
  49. data/events/event_invalid_output_dataset_facet.json +29 -0
  50. data/events/event_invalid_run_facet.json +28 -0
  51. data/events/event_no_run_id.json +28 -0
  52. data/events/event_simple.json +29 -0
  53. data/fluentd-openlineage-parser.gemspec +28 -0
  54. data/lib/fluent/plugin/fluentd-openlineage-parser.rb +182 -0
  55. data/spec/Naming.md +500 -0
  56. data/spec/OpenLineage.json +304 -0
  57. data/spec/OpenLineage.md +179 -0
  58. data/spec/OpenLineage.yml +27 -0
  59. data/spec/OpenLineageModel.svg +1 -0
  60. data/spec/Versioning.md +49 -0
  61. data/spec/facets/ColumnLineageDatasetFacet.json +96 -0
  62. data/spec/facets/ColumnLineageDatasetFacet.md +106 -0
  63. data/spec/facets/DataQualityAssertionsDatasetFacet.json +49 -0
  64. data/spec/facets/DataQualityMetricsInputDatasetFacet.json +76 -0
  65. data/spec/facets/DatasetVersionDatasetFacet.json +31 -0
  66. data/spec/facets/DatasourceDatasetFacet.json +32 -0
  67. data/spec/facets/DocumentationDatasetFacet.json +31 -0
  68. data/spec/facets/DocumentationJobFacet.json +30 -0
  69. data/spec/facets/ErrorMessageRunFacet.json +41 -0
  70. data/spec/facets/ExternalQueryRunFacet.json +36 -0
  71. data/spec/facets/ExternalQueryRunFacet.md +49 -0
  72. data/spec/facets/ExtractionErrorRunFacet.json +58 -0
  73. data/spec/facets/JobTypeJobFacet.json +41 -0
  74. data/spec/facets/LifecycleStateChangeDatasetFacet.json +46 -0
  75. data/spec/facets/NominalTimeRunFacet.json +38 -0
  76. data/spec/facets/OutputStatisticsOutputDatasetFacet.json +36 -0
  77. data/spec/facets/OwnershipDatasetFacet.json +45 -0
  78. data/spec/facets/OwnershipJobFacet.json +45 -0
  79. data/spec/facets/ParentRunFacet.json +54 -0
  80. data/spec/facets/ProcessingEngineRunFacet.json +41 -0
  81. data/spec/facets/SQLJobFacet.json +30 -0
  82. data/spec/facets/SchemaDatasetFacet.json +59 -0
  83. data/spec/facets/SourceCodeJobFacet.json +34 -0
  84. data/spec/facets/SourceCodeLocationJobFacet.json +60 -0
  85. data/spec/facets/StorageDatasetFacet.json +35 -0
  86. data/spec/facets/SymlinksDatasetFacet.json +47 -0
  87. data/spec/registry/core/registry.json +31 -0
  88. data/spec/registry/gcp/facets/GcpCommonJobFacet.json +43 -0
  89. data/spec/registry/gcp/registry.json +6 -0
  90. data/spec/release.sh +80 -0
  91. data/spec/tests/ColumnLineageDatasetFacet/1.json +172 -0
  92. data/spec/tests/DataQualityAssertionsDatasetFacet/1.json +58 -0
  93. data/spec/tests/DataQualityMetricsInputDatasetFacet/1.json +23 -0
  94. data/spec/tests/DatasetVersionDatasetFacet/1.json +7 -0
  95. data/spec/tests/DatasourceDatasetFacet/1.json +7 -0
  96. data/spec/tests/DocumentationDatasetFacet/1.json +7 -0
  97. data/spec/tests/DocumentationJobFacet/1.json +7 -0
  98. data/spec/tests/ErrorMessageRunFacet/1.json +9 -0
  99. data/spec/tests/ExternalQueryRunFacet/1.json +8 -0
  100. data/spec/tests/ExtractionErrorRunFacet/1.json +15 -0
  101. data/spec/tests/JobTypeJobFacet/1.json +9 -0
  102. data/spec/tests/LifecycleStateChangeDatasetFacet/1.json +11 -0
  103. data/spec/tests/NominalTimeRunFacet/1.json +8 -0
  104. data/spec/tests/OutputStatisticsOutputDatasetFacet/1.json +9 -0
  105. data/spec/tests/OwnershipDatasetFacet/1.json +11 -0
  106. data/spec/tests/OwnershipJobFacet/1.json +11 -0
  107. data/spec/tests/ParentRunFacet/1.json +13 -0
  108. data/spec/tests/ProcessingEngineRunFacet/1.json +9 -0
  109. data/spec/tests/SQLJobFacet/1.json +7 -0
  110. data/spec/tests/SchemaDatasetFacet/1.json +92 -0
  111. data/spec/tests/SourceCodeJobFacet/1.json +8 -0
  112. data/spec/tests/SourceCodeLocationJobFacet/1.json +8 -0
  113. data/spec/tests/StorageDatasetFacet/1.json +8 -0
  114. data/spec/tests/SymlinksDatasetFacet/1.json +13 -0
  115. data/spec/tests/example_full_event.json +24 -0
  116. data/test/helper.rb +8 -0
  117. data/test/plugin/test_parser_openlineage.rb +141 -0
  118. metadata +298 -0
data/.idea/modules.xml ADDED
@@ -0,0 +1,41 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/benchmark-memory-0.2.iml" filepath="$PROJECT_DIR$/.idea/modules/benchmark-memory-0.2.iml" />
6
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/bigdecimal-3.1.iml" filepath="$PROJECT_DIR$/.idea/modules/bigdecimal-3.1.iml" />
7
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/certstore_c-0.1.iml" filepath="$PROJECT_DIR$/.idea/modules/certstore_c-0.1.iml" />
8
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/concurrent-ruby-1.3.iml" filepath="$PROJECT_DIR$/.idea/modules/concurrent-ruby-1.3.iml" />
9
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/concurrent-ruby-1.31.iml" filepath="$PROJECT_DIR$/.idea/modules/concurrent-ruby-1.31.iml" />
10
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/connection_pool-2.4.iml" filepath="$PROJECT_DIR$/.idea/modules/connection_pool-2.4.iml" />
11
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/cool.io-1.8.iml" filepath="$PROJECT_DIR$/.idea/modules/cool.io-1.8.iml" />
12
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/drb-2.2.iml" filepath="$PROJECT_DIR$/.idea/modules/drb-2.2.iml" />
13
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/drb-2.21.iml" filepath="$PROJECT_DIR$/.idea/modules/drb-2.21.iml" />
14
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/ffi-1.17.iml" filepath="$PROJECT_DIR$/.idea/modules/ffi-1.17.iml" />
15
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/ffi-win32-extensions-1.0.iml" filepath="$PROJECT_DIR$/.idea/modules/ffi-win32-extensions-1.0.iml" />
16
+ <module fileurl="file://$PROJECT_DIR$/.idea/fluentd.iml" filepath="$PROJECT_DIR$/.idea/fluentd.iml" />
17
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/fluentd-1.17.iml" filepath="$PROJECT_DIR$/.idea/modules/fluentd-1.17.iml" />
18
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/http_parser.rb-0.8.iml" filepath="$PROJECT_DIR$/.idea/modules/http_parser.rb-0.8.iml" />
19
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/json-2.7.iml" filepath="$PROJECT_DIR$/.idea/modules/json-2.7.iml" />
20
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/json-2.71.iml" filepath="$PROJECT_DIR$/.idea/modules/json-2.71.iml" />
21
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/msgpack-1.7.iml" filepath="$PROJECT_DIR$/.idea/modules/msgpack-1.7.iml" />
22
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/mutex_m-0.2.iml" filepath="$PROJECT_DIR$/.idea/modules/mutex_m-0.2.iml" />
23
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/new_gem.iml" filepath="$PROJECT_DIR$/.idea/modules/new_gem.iml" />
24
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/power_assert-2.0.iml" filepath="$PROJECT_DIR$/.idea/modules/power_assert-2.0.iml" />
25
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/rake-13.2.iml" filepath="$PROJECT_DIR$/.idea/modules/rake-13.2.iml" />
26
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/rake-13.21.iml" filepath="$PROJECT_DIR$/.idea/modules/rake-13.21.iml" />
27
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/rake-compiler-1.2.iml" filepath="$PROJECT_DIR$/.idea/modules/rake-compiler-1.2.iml" />
28
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/rusty_json_schema-0.15.iml" filepath="$PROJECT_DIR$/.idea/modules/rusty_json_schema-0.15.iml" />
29
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/serverengine-2.3.iml" filepath="$PROJECT_DIR$/.idea/modules/serverengine-2.3.iml" />
30
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/sigdump-0.2.iml" filepath="$PROJECT_DIR$/.idea/modules/sigdump-0.2.iml" />
31
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/specifications.iml" filepath="$PROJECT_DIR$/.idea/modules/specifications.iml" />
32
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/specifications1.iml" filepath="$PROJECT_DIR$/.idea/modules/specifications1.iml" />
33
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/strptime-0.2.iml" filepath="$PROJECT_DIR$/.idea/modules/strptime-0.2.iml" />
34
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/thermite-0.13.iml" filepath="$PROJECT_DIR$/.idea/modules/thermite-0.13.iml" />
35
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/webrick-1.8.iml" filepath="$PROJECT_DIR$/.idea/modules/webrick-1.8.iml" />
36
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/win32-event-0.6.iml" filepath="$PROJECT_DIR$/.idea/modules/win32-event-0.6.iml" />
37
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/win32-ipc-0.7.iml" filepath="$PROJECT_DIR$/.idea/modules/win32-ipc-0.7.iml" />
38
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/yajl-ruby-1.4.iml" filepath="$PROJECT_DIR$/.idea/modules/yajl-ruby-1.4.iml" />
39
+ </modules>
40
+ </component>
41
+ </project>
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,223 @@
1
+ # Fluentd and Openlineage
2
+
3
+ ## Why are Fluentd and Openlineage a perfect match?
4
+
5
+ **Fluentd support is experimental and could be changed or removed in a future release.**
6
+
7
+ Modern data collectors (Fluentd, Logstash, Vector, etc.) can be extremely useful when designing
8
+ production-grade architectures for processing Openlineage events.
9
+
10
+ They can be used for features such as:
11
+ * A server-proxy in front of the Openlineage backend (like Marquez) to handle load spikes and buffer incoming events when the backend is down (e.g., due to a maintenance window).
12
+ * The ability to copy the event to multiple backends such as HTTP, Kafka or cloud object storage. Data collectors implement that out-of-the-box.
13
+
14
+ They have great potential except for a single missing feature: *the ability to parse and validate OpenLineage events at the point of HTTP input*.
15
+ This is important as one would like to get a `Bad Request` response immediately when sending invalid OpenLineage events to an endpoint.
16
+ Fortunately, this missing feature can be implemented as a plugin.
17
+
18
+ We decided to implement an OpenLineage parser plugin for Fluentd because:
19
+ * Fluentd has a small footprint in terms of resource utilization and does not require a JVM to be installed,
20
+ * Fluentd plugins can be installed from local files (no need to register in a plugin repository).
21
+
22
+ As a side effect, the Fluentd integration can also be used as an OpenLineage HTTP validation backend for
23
+ development purposes.
24
+
25
+ ## Fluentd features
26
+
27
+ Some interesting Fluentd features are available according to the [official documentation](https://docs.fluentd.org/):
28
+
29
+ * [Buffering/retrying parameters](https://docs.fluentd.org/output#buffering-retrying-parameters),
30
+ * Useful output plugins:
31
+ * [Output Kafka plugin](https://docs.fluentd.org/output/kafka),
32
+ * [Output S3 plugin](https://docs.fluentd.org/output/s3),
33
+ * [Output copy plugin](https://docs.fluentd.org/output/copy),
34
+ * [Output HTTP plugin](https://docs.fluentd.org/output/http) with options such as [retryable_response_codes](https://docs.fluentd.org/output/http#retryable_response_codes) to specify backend codes that should cause a retry,
35
+ * [Buffer configuration](https://docs.fluentd.org/configuration/buffer-section),
36
+ * [Embedding Ruby Expressions in config files to contain environment variables](https://docs.fluentd.org/configuration/config-file#embedding-ruby-expressions).
37
+
38
+ The official Fluentd documentation does not mention guarantees about event ordering. However, retrieving
39
+ Openlineage events and buffering in file/memory should be considered a millisecond-long operation,
40
+ while any HTTP backend cannot guarantee ordering in such a case. On the other hand, by default
41
+ the number of threads used to flush the buffer is set to 1 and is configurable ([flush_thread_count](https://docs.fluentd.org/output#flush_thread_count)).
42
+
43
+ ## Quickstart with Docker
44
+
45
+ Please refer to the [`Dockerfile`](docker/Dockerfile) and [`fluent.conf`](docker/conf/fluent.conf) to see how to build and install the plugin with
46
+ the example usage scenario provided in [`docker-compose.yml`](docker/docker-compose.yml). To run the example setup, go to the `docker` directory and execute the following command:
47
+
48
+ ```shell
49
+ docker-compose up
50
+ ```
51
+
52
+ After all the containers have started, send some HTTP requests:
53
+
54
+ ```shell
55
+ curl -X POST \
56
+ -d '{"test":"test"}' \
57
+ -H 'Content-Type: application/json' \
58
+ http://localhost:9880/api/v1/lineage
59
+ ```
60
+ In response, you should see the following message:
61
+
62
+ `Openlineage validation failed: path "/": "run" is a required property, path "/": "job" is a required property, path "/": "eventTime" is a required property, path "/": "producer" is a required property, path "/": "schemaURL" is a required property`
63
+
64
+ Next, send some valid requests:
65
+
66
+ ```shell
67
+ curl -X POST \
68
+ -d "$(cat test-start.json)" \
69
+ -H 'Content-Type: application/json' \
70
+ http://localhost:9880/api/v1/lineage
71
+ ```
72
+
73
+ ```shell
74
+ curl -X POST \
75
+ -d "$(cat test-complete.json)" \
76
+ -H 'Content-Type: application/json' \
77
+ http://localhost:9880/api/v1/lineage
78
+ ```
79
+
80
+ After that you should see entities in Marquez (http://localhost:3000/) in the `my-namespace` namespace.
81
+
82
+ To clean up, run
83
+ ```shell
84
+ docker-compose down
85
+ ```
86
+
87
+ ## Deployment on Kubernetes
88
+
89
+ ***Section under construction***
90
+
91
+ ## Parser plugin
92
+
93
+ Openlineage-parser is a Fluentd plugin that verifies whether a JSON document matches the OpenLineage schema.
94
+
95
+ ### Configuration
96
+
97
+ Although an OpenLineage event is specified using JSON Schema, its real-life validation may
98
+ vary, and backends like Marquez may take a less strict approach to validating certain types of facets.
99
+ For example, Marquez allows a non-valid `DataQualityMetricsInputDatasetFacet`.
100
+ To give more flexibility, the Fluentd parser accepts the following configuration parameters:
101
+ ```ruby
102
+ validate_input_dataset_facets => true/false
103
+ validate_output_dataset_facets => true/false
104
+ validate_dataset_facets => true/false
105
+ validate_run_facets => true/false
106
+ validate_job_facets => true/false
107
+ ```
108
+ By default, only `validate_run_facets` and `validate_job_facets` are set to `true`.
109
+
110
+ ### Development
111
+
112
+ To build dependencies:
113
+ ```shell
114
+ bundle install
115
+ bundle
116
+ ```
117
+
118
+ To run the tests:
119
+ ```shell
120
+ bundle exec rake test
121
+ ```
122
+
123
+ #### Installation
124
+
125
+ The easiest way to install the plugin is to install external packages:
126
+ * `rusty_json_schema` installs a JSON Schema validation library implemented in Rust,
127
+ * `fluent-plugin-out-http` allows non-bulk HTTP out requests (sending each OpenLineage event in a separate request).
128
+ ```shell
129
+ fluent-gem install rusty_json_schema
130
+ fluent-gem install fluent-plugin-out-http
131
+ ```
132
+ Once the external dependencies are installed, a single Ruby code file `parser_openlineage.rb` needs
133
+ to be copied into the Fluentd plugins directory ([installing custom plugin](https://docs.fluentd.org/plugin-development#installing-custom-plugins)).
134
+
135
+ ## Fluentd proxy setup
136
+ ### Monitoring with Prometheus
137
+
138
+ The sections above describe how to use this plugin (it is only a plugin, so you still need the main Fluentd application to run it). You may also want to monitor the Fluentd application itself using Prometheus. To do so, add the fluent-plugin-prometheus plugin from https://github.com/fluent/fluent-plugin-prometheus and include the following setup in your prometheus.yml file:
139
+
140
+ ```yml
141
+ global:
142
+ scrape_interval: 10s # Set the scrape interval to every 10 seconds. Default is every 1 minute.
143
+
144
+ #### A scrape configuration containing exactly one endpoint to scrape:
145
+ #### Here it's the Fluentd Prometheus metrics endpoint.
146
+ scrape_configs:
147
+ - job_name: 'fluentd'
148
+ static_configs:
149
+ - targets: ['localhost:24231']
150
+ ```
151
+
152
+ You may also want to include the following additional parameters to your fluent.conf file:
153
+
154
+ ```xml
155
+ #### source
156
+ <source>
157
+ @type forward
158
+ bind 0.0.0.0
159
+ port 24224
160
+ </source>
161
+
162
+ #### count the number of incoming records per tag
163
+ <filter company.*>
164
+ @type prometheus
165
+ <metric>
166
+ name fluentd_input_status_num_records_total
167
+ type counter
168
+ desc The total number of incoming records
169
+ <labels>
170
+ tag ${tag}
171
+ hostname ${hostname}
172
+ </labels>
173
+ </metric>
174
+ </filter>
175
+
176
+ #### count the number of outgoing records per tag
177
+ <match company.*>
178
+ @type copy
179
+
180
+ <store>
181
+ @type forward
182
+ <server>
183
+ name myserver1
184
+ host 192.168.1.3
185
+ port 24224
186
+ weight 60
187
+ </server>
188
+ </store>
189
+
190
+ <store>
191
+ @type prometheus
192
+ <metric>
193
+ name fluentd_output_status_num_records_total
194
+ type counter
195
+ desc The total number of outgoing records
196
+ <labels>
197
+ tag ${tag}
198
+ hostname ${hostname}
199
+ </labels>
200
+ </metric>
201
+ </store>
202
+
203
+ </match>
204
+
205
+ #### expose metrics in prometheus format
206
+
207
+ <source>
208
+ @type prometheus
209
+ bind 0.0.0.0
210
+ port 24231
211
+ metrics_path /metrics
212
+ </source>
213
+
214
+ <source>
215
+ @type prometheus_output_monitor
216
+ interval 10
217
+ <labels>
218
+ hostname ${hostname}
219
+ </labels>
220
+ </source>
221
+ ```
222
+
223
+ For any additional information, you can check out the official Fluentd documentation at https://docs.fluentd.org/monitoring-fluentd/monitoring-prometheus#example-prometheus-queries
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ require "bundler"
2
+ Bundler::GemHelper.install_tasks
3
+
4
+ require "rake/testtask"
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs.push("lib", "test")
8
+ t.test_files = FileList["test/**/test_*.rb"]
9
+ t.verbose = true
10
+ t.warning = true
11
+ end
12
+
13
+ task default: [:test]
@@ -0,0 +1,101 @@
1
+ <source>
2
+ @type http
3
+ port 9880
4
+ <parse>
5
+ @type openlineage
6
+ </parse>
7
+ </source>
8
+
9
+
10
+ # https://docs.fluentd.org/output/http
11
+ <match api.v1.lineage> # tag should match fluentd input endpoint url http://localhost:9880/api/v1/lineage
12
+ @type copy
13
+ <store>
14
+ @type http
15
+ endpoint_url "#{ENV['MARQUEZ_HTTP_ENDPOINT']}"
16
+ content_type application/json
17
+ bulk_request false # available since using https://github.com/fluent-plugins-nursery/fluent-plugin-out-http
18
+ buffered true
19
+ serializer json
20
+ retryable_response_codes 408, 429, 500, 502, 503
21
+
22
+ <buffer>
23
+ @type file
24
+ path /tmp/openlineage/buf/chunk-*
25
+ flush_mode immediate
26
+ </buffer>
27
+ </store>
28
+
29
+ <store>
30
+ @type stdout # testing purpose to demonstrate that copy is working
31
+ </store>
32
+
33
+ # other output stores can be put
34
+ </match>
35
+
36
+
37
+ # source for prometheus metrics
38
+ <source>
39
+ @type forward
40
+ bind 0.0.0.0
41
+ port 24224
42
+ </source>
43
+
44
+ # count the number of incoming records per tag
45
+ <filter company.*>
46
+ @type prometheus
47
+ <metric>
48
+ name fluentd_input_status_num_records_total
49
+ type counter
50
+ desc The total number of incoming records
51
+ <labels>
52
+ tag ${tag}
53
+ hostname ${hostname}
54
+ </labels>
55
+ </metric>
56
+ </filter>
57
+
58
+ # count the number of outgoing records per tag
59
+ <match company.*>
60
+ @type copy
61
+
62
+ <store>
63
+ @type forward
64
+ <server>
65
+ name myserver1
66
+ host 192.168.1.3
67
+ port 24224
68
+ weight 60
69
+ </server>
70
+ </store>
71
+
72
+ <store>
73
+ @type prometheus
74
+ <metric>
75
+ name fluentd_output_status_num_records_total
76
+ type counter
77
+ desc The total number of outgoing records
78
+ <labels>
79
+ tag ${tag}
80
+ hostname ${hostname}
81
+ </labels>
82
+ </metric>
83
+ </store>
84
+
85
+ </match>
86
+
87
+ # expose metrics in prometheus format
88
+ <source>
89
+ @type prometheus
90
+ bind 0.0.0.0
91
+ port 24231
92
+ metrics_path /metrics
93
+ </source>
94
+
95
+ <source>
96
+ @type prometheus_output_monitor
97
+ interval 10
98
+ <labels>
99
+ hostname ${hostname}
100
+ </labels>
101
+ </source>
@@ -0,0 +1,73 @@
1
+ {
2
+ "eventType": "COMPLETE",
3
+ "eventTime": "2019-05-09T19:50:24.201361Z",
4
+ "run": {
5
+ "runId": "d46e465b-d358-4d32-83d4-df660ff614dd"
6
+ },
7
+ "job": {
8
+ "namespace": "my-namespace",
9
+ "name": "my-job"
10
+ },
11
+ "inputs": [
12
+ {
13
+ "namespace": "my-namespace",
14
+ "name": "my-input",
15
+ "facets": {
16
+ "schema": {
17
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
18
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
19
+ "fields": [
20
+ {
21
+ "name": "a",
22
+ "type": "INTEGER"
23
+ },
24
+ {
25
+ "name": "b",
26
+ "type": "TIMESTAMP"
27
+ },
28
+ {
29
+ "name": "c",
30
+ "type": "INTEGER"
31
+ },
32
+ {
33
+ "name": "d",
34
+ "type": "INTEGER"
35
+ }
36
+ ]
37
+ }
38
+ }
39
+ }
40
+ ],
41
+ "outputs": [
42
+ {
43
+ "namespace": "my-namespace",
44
+ "name": "my-output",
45
+ "facets": {
46
+ "schema": {
47
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
48
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
49
+ "fields": [
50
+ {
51
+ "name": "a",
52
+ "type": "INTEGER"
53
+ },
54
+ {
55
+ "name": "b",
56
+ "type": "TIMESTAMP"
57
+ },
58
+ {
59
+ "name": "c",
60
+ "type": "INTEGER"
61
+ },
62
+ {
63
+ "name": "d",
64
+ "type": "INTEGER"
65
+ }
66
+ ]
67
+ }
68
+ }
69
+ }
70
+ ],
71
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
72
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
73
+ }
@@ -0,0 +1,73 @@
1
+ {
2
+ "eventType": "START",
3
+ "eventTime": "2019-05-09T19:49:24.201361Z",
4
+ "run": {
5
+ "runId": "d46e465b-d358-4d32-83d4-df660ff614dd"
6
+ },
7
+ "job": {
8
+ "namespace": "my-namespace",
9
+ "name": "my-job"
10
+ },
11
+ "inputs": [
12
+ {
13
+ "namespace": "my-namespace",
14
+ "name": "my-input",
15
+ "facets": {
16
+ "schema": {
17
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
18
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
19
+ "fields": [
20
+ {
21
+ "name": "a",
22
+ "type": "INTEGER"
23
+ },
24
+ {
25
+ "name": "b",
26
+ "type": "TIMESTAMP"
27
+ },
28
+ {
29
+ "name": "c",
30
+ "type": "INTEGER"
31
+ },
32
+ {
33
+ "name": "d",
34
+ "type": "INTEGER"
35
+ }
36
+ ]
37
+ }
38
+ }
39
+ }
40
+ ],
41
+ "outputs": [
42
+ {
43
+ "namespace": "my-namespace",
44
+ "name": "my-output",
45
+ "facets": {
46
+ "schema": {
47
+ "_producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
48
+ "_schemaURL": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/spec/OpenLineage.json#/definitions/SchemaDatasetFacet",
49
+ "fields": [
50
+ {
51
+ "name": "a",
52
+ "type": "INTEGER"
53
+ },
54
+ {
55
+ "name": "b",
56
+ "type": "TIMESTAMP"
57
+ },
58
+ {
59
+ "name": "c",
60
+ "type": "INTEGER"
61
+ },
62
+ {
63
+ "name": "d",
64
+ "type": "INTEGER"
65
+ }
66
+ ]
67
+ }
68
+ }
69
+ }
70
+ ],
71
+ "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client",
72
+ "schemaURL": "https://openlineage.io/spec/1-0-1/OpenLineage.json#/definitions/RunEvent"
73
+ }