embulk-parser-xml2 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,240 @@
1
+ /**
2
+ * The MIT License (MIT)
3
+ *
4
+ * Copyright (C) 2016 Yahoo Japan Corporation. All Rights Reserved.
5
+ *
6
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ * of this software and associated documentation files (the "Software"), to deal
8
+ * in the Software without restriction, including without limitation the rights
9
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the Software is
11
+ * furnished to do so, subject to the following conditions:
12
+ *
13
+ * The above copyright notice and this permission notice shall be included in
14
+ * all copies or substantial portions of the Software.
15
+ *
16
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ * THE SOFTWARE.
23
+ */
24
+
25
+ package org.embulk.parser.xml2;
26
+
27
+ import static org.junit.Assert.*;
28
+
29
+ import java.io.File;
30
+ import java.io.FileInputStream;
31
+ import java.io.FileNotFoundException;
32
+ import java.util.HashMap;
33
+ import java.util.LinkedList;
34
+ import java.util.List;
35
+ import java.util.Map;
36
+
37
+ import org.embulk.EmbulkTestRuntime;
38
+ import org.embulk.config.ConfigSource;
39
+ import org.embulk.config.TaskReport;
40
+ import org.embulk.config.TaskSource;
41
+ import org.embulk.parser.xml2.Xml2ParserPlugin.PluginTask;
42
+ import org.embulk.spi.Column;
43
+ import org.embulk.spi.ColumnVisitor;
44
+ import org.embulk.spi.Exec;
45
+ import org.embulk.spi.Page;
46
+ import org.embulk.spi.PageReader;
47
+ import org.embulk.spi.ParserPlugin;
48
+ import org.embulk.spi.Schema;
49
+ import org.embulk.spi.TransactionalPageOutput;
50
+ import org.embulk.spi.time.Timestamp;
51
+ import org.embulk.spi.util.InputStreamFileInput;
52
+ import org.junit.Before;
53
+ import org.junit.BeforeClass;
54
+ import org.junit.Rule;
55
+ import org.junit.Test;
56
+
57
+ import com.google.common.collect.ImmutableList;
58
+ import com.google.common.collect.ImmutableMap;
59
+
60
+ public class TestXml2ParserPlugin {
61
+ // JUnit rule holding the Embulk test runtime; presumably what makes the Exec.* calls below usable in tests (confirm).
+ @Rule
62
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
+ // Plugin under test; re-created before every test method by createResource().
63
+ private Xml2ParserPlugin plugin;
64
+
65
+ // Filesystem path of the sample_01.xml fixture; assigned once in initializeConstant().
+ private static String PATH_PREFIX;
66
+
67
+ /**
+ * Locates the sample_01.xml fixture on the classpath once for the whole class
+ * and stores its filesystem path in PATH_PREFIX.
+ *
+ * @throws java.net.URISyntaxException if the resource URL is not a valid URI
+ */
+ @BeforeClass
68
+ public static void initializeConstant() throws java.net.URISyntaxException {
69
+ // Go through URI -> File: URL.getPath() keeps percent-escapes (e.g. %20 for a space),
+ // which would make the later FileInputStream fail for such checkout directories.
+ PATH_PREFIX = new File(Xml2ParserPlugin.class.getClassLoader().getResource("sample_01.xml").toURI()).getPath();
70
+ }
71
+
72
+ /** Gives each test method its own freshly constructed parser plugin. */
+ @Before
73
+ public void createResource() {
74
+ this.plugin = new Xml2ParserPlugin();
75
+ }
76
+
77
+ /**
+ * Smoke test: calls transaction() with the sample configuration and a control
+ * callback that does nothing; the test makes no assertions of its own.
+ */
+ @Test
78
+ public void testTransaction() {
79
+ final ParserPlugin.Control doNothing = new ParserPlugin.Control() {
80
+ @Override
81
+ public void run(TaskSource taskSource, Schema schema) {
82
+ }
83
+ };
84
+ plugin.transaction(config(), doNothing);
85
+ }
86
+
87
+ /**
+ * Runs the plugin over the bundled sample_01.xml, collects every emitted record
+ * into a map keyed by column name, and checks the two expected page records.
+ *
+ * @throws FileNotFoundException if the sample file cannot be opened
+ */
+ @Test
88
+ public void testFile() throws FileNotFoundException {
89
+
90
+ ConfigSource config = config();
91
+ final PluginTask task = config.loadConfig(PluginTask.class);
92
+ final Schema schema = task.getSchema().toSchema();
93
+ plugin.transaction(config, new ParserPlugin.Control() {
94
+ @Override
95
+ public void run(TaskSource taskSource, Schema schema) {
96
+ }
97
+ });
98
+
99
+ // Every record read back from the emitted pages, as column name -> value.
100
+ final List<Map<String,Object>> resultList = new LinkedList<Map<String,Object>>();
101
+
102
+ plugin.run(task.dump(), schema,
103
+ new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(PATH_PREFIX))),
104
+ new TransactionalPageOutput() {
105
+
106
+ private final PageReader reader = new PageReader(schema);
107
+
108
+ @Override
109
+ public void add(Page page) {
110
+ reader.setPage(page);
111
+
112
+ while (reader.nextRecord()) {
113
+ final Map<String, Object> record = new HashMap<String, Object>();
114
+ for (Column column : schema.getColumns()) {
+ // Copy the cell into the map with the accessor matching the column type; nulls stay null.
115
+ column.visit(new ColumnVisitor() {
116
+ @Override
117
+ public void timestampColumn(Column column) {
118
+ if (reader.isNull(column)) {
119
+ record.put(column.getName(), null);
120
+ return;
121
+ }
122
+ record.put(column.getName(), reader.getTimestamp(column));
123
+ }
124
+
125
+ @Override
126
+ public void stringColumn(Column column) {
127
+ if (reader.isNull(column)) {
128
+ record.put(column.getName(), null);
129
+ return;
130
+ }
131
+ record.put(column.getName(), reader.getString(column));
132
+ }
133
+
134
+ @Override
135
+ public void longColumn(Column column) {
136
+ if (reader.isNull(column)) {
137
+ record.put(column.getName(), null);
138
+ return;
139
+ }
140
+ record.put(column.getName(), reader.getLong(column));
141
+ }
142
+
143
+ @Override
144
+ public void doubleColumn(Column column) {
145
+ if (reader.isNull(column)) {
146
+ record.put(column.getName(), null);
147
+ return;
148
+ }
149
+ record.put(column.getName(), reader.getDouble(column));
150
+ }
151
+
152
+ @Override
153
+ public void booleanColumn(Column column) {
154
+ if (reader.isNull(column)) {
155
+ record.put(column.getName(), null);
156
+ return;
157
+ }
158
+ record.put(column.getName(), reader.getBoolean(column));
159
+ }
160
+
161
+ @Override
162
+ public void jsonColumn(Column column) {
163
+ if (reader.isNull(column)) {
164
+ record.put(column.getName(), null);
165
+ return;
166
+ }
+ // JSON cells are read with getJson(); getString() is the accessor for string columns.
167
+ record.put(column.getName(), reader.getJson(column));
168
+ }
169
+ });
170
+ }
171
+ resultList.add(record);
172
+ }
173
+ }
174
+
175
+ @Override
176
+ public void finish() {
177
+ }
178
+
179
+ @Override
180
+ public void close() {
181
+ }
182
+
183
+ @Override
184
+ public void abort() {
185
+ }
186
+
187
+ @Override
188
+ public TaskReport commit() {
189
+ return Exec.newTaskReport();
190
+ }
191
+ });
192
+
193
+ // Print the collected records to ease debugging, then verify them.
194
+ for (Map<String,Object> r : resultList) {
195
+ System.out.println(r);
196
+ }
197
+
198
+ assertEquals(2,resultList.size());
199
+
200
+ Map<String, Object> record0 = resultList.get(0);
201
+ assertEquals("Wikipedia:アップロードログ 2004年4月",record0.get("title"));
202
+ assertEquals(1L,record0.get("id"));
203
+ assertEquals("なんか書く",record0.get("revision/text"));
204
+ assertEquals(1083336360L * 1000L, ((Timestamp)record0.get("revision/timestamp")).toEpochMilli());
205
+
206
+ Map<String, Object> record1 = resultList.get(1);
207
+ assertEquals("アンパサンド",record1.get("title"));
208
+ assertEquals(5L,record1.get("id"));
209
+ assertEquals("アンパサンドとは\n「…と…」を意味する記号である。",record1.get("revision/text"));
210
+ assertEquals(1449883580L * 1000L, ((Timestamp)record1.get("revision/timestamp")).toEpochMilli());
211
+ }
212
+
213
+ private ConfigSource config() {
214
+ return Exec.newConfigSource().set("in", inputConfig()).set("root", "mediawiki/page")
215
+ .set("schema", schemaConfig()).set("out", outputConfig());
216
+ }
217
+
218
+ /** Returns the "in" section: a file input whose path_prefix is the sample fixture path. */
+ private ImmutableMap<String, Object> inputConfig() {
219
+ return new ImmutableMap.Builder<String, Object>()
220
+ .put("type", "file")
221
+ .put("path_prefix", PATH_PREFIX)
222
+ .put("last_path", "")
223
+ .build();
224
+ }
225
+
226
+ /** Returns the "out" section, which only names the stdout output type. */
+ private ImmutableMap<String, Object> outputConfig() {
227
+ return new ImmutableMap.Builder<String, Object>()
228
+ .put("type", "stdout")
229
+ .build();
230
+ }
231
+
232
+ /**
+ * Returns the column definitions used by the tests, in order: id (long),
+ * title (string), revision/timestamp (timestamp, UTC) and revision/text (string).
+ */
+ private ImmutableList<Object> schemaConfig() {
233
+ return new ImmutableList.Builder<Object>()
234
+ .add(ImmutableMap.of("name", "id", "type", "long"))
235
+ .add(ImmutableMap.of("name", "title", "type", "string"))
236
+ .add(ImmutableMap.of("name", "revision/timestamp", "type", "timestamp", "format", "%Y-%m-%dT%H:%M:%SZ", "timezone", "UTC"))
237
+ .add(ImmutableMap.of("name", "revision/text", "type", "string"))
238
+ .build();
239
+ }
240
+ }
@@ -0,0 +1,80 @@
1
+ <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="ja">
2
+ <siteinfo>
3
+ <sitename>Wikipedia</sitename>
4
+ <dbname>jawiki</dbname>
5
+ <base>https://ja.wikipedia.org/wiki/%E3%83%A1%E3%82%A4%E3%83%B3%E3%83%9A%E3%83%BC%E3%82%B8</base>
6
+ <generator>MediaWiki 1.27.0-wmf.10</generator>
7
+ <case>first-letter</case>
8
+ <namespaces>
9
+ <namespace key="-2" case="first-letter">メディア</namespace>
10
+ <namespace key="-1" case="first-letter">特別</namespace>
11
+ <namespace key="0" case="first-letter" />
12
+ <namespace key="1" case="first-letter">ノート</namespace>
13
+ <namespace key="2" case="first-letter">利用者</namespace>
14
+ <namespace key="3" case="first-letter">利用者‐会話</namespace>
15
+ <namespace key="4" case="first-letter">Wikipedia</namespace>
16
+ <namespace key="5" case="first-letter">Wikipedia‐ノート</namespace>
17
+ <namespace key="6" case="first-letter">ファイル</namespace>
18
+ <namespace key="7" case="first-letter">ファイル‐ノート</namespace>
19
+ <namespace key="8" case="first-letter">MediaWiki</namespace>
20
+ <namespace key="9" case="first-letter">MediaWiki‐ノート</namespace>
21
+ <namespace key="10" case="first-letter">Template</namespace>
22
+ <namespace key="11" case="first-letter">Template‐ノート</namespace>
23
+ <namespace key="12" case="first-letter">Help</namespace>
24
+ <namespace key="13" case="first-letter">Help‐ノート</namespace>
25
+ <namespace key="14" case="first-letter">Category</namespace>
26
+ <namespace key="15" case="first-letter">Category‐ノート</namespace>
27
+ <namespace key="100" case="first-letter">Portal</namespace>
28
+ <namespace key="101" case="first-letter">Portal‐ノート</namespace>
29
+ <namespace key="102" case="first-letter">プロジェクト</namespace>
30
+ <namespace key="103" case="first-letter">プロジェクト‐ノート</namespace>
31
+ <namespace key="828" case="first-letter">モジュール</namespace>
32
+ <namespace key="829" case="first-letter">モジュール‐ノート</namespace>
33
+ <namespace key="2300" case="first-letter">Gadget</namespace>
34
+ <namespace key="2301" case="first-letter">Gadget talk</namespace>
35
+ <namespace key="2302" case="case-sensitive">Gadget definition</namespace>
36
+ <namespace key="2303" case="case-sensitive">Gadget definition talk</namespace>
37
+ <namespace key="2600" case="first-letter">Topic</namespace>
38
+ </namespaces>
39
+ </siteinfo>
40
+ <page>
41
+ <title>Wikipedia:アップロードログ 2004年4月</title>
42
+ <ns>4</ns>
43
+ <id>1</id>
44
+ <restrictions>sysop</restrictions>
45
+ <revision>
46
+ <id>2168855</id>
47
+ <parentid>299151</parentid>
48
+ <timestamp>2004-04-30T14:46:00Z</timestamp>
49
+ <contributor>
50
+ <username>Oxhop</username>
51
+ <id>2551</id>
52
+ </contributor>
53
+ <minor />
54
+ <comment>&quot;LocationMacedonia.png&quot;をアップロードしました。: マケドニアの位置 - 英語版より</comment>
55
+ <model>wikitext</model>
56
+ <format>text/x-wiki</format>
57
+ <text xml:space="preserve">なんか書く</text>
58
+ <sha1>gbhvqlwj7ga4v9ghhy0n88iqmlo19vz</sha1>
59
+ </revision>
60
+ </page>
61
+ <page>
62
+ <title>アンパサンド</title>
63
+ <ns>0</ns>
64
+ <id>5</id>
65
+ <revision>
66
+ <id>57857555</id>
67
+ <parentid>56511929</parentid>
68
+ <timestamp>2015-12-12T01:26:20Z</timestamp>
69
+ <contributor>
70
+ <username>&&&&amp;&amp;&amp;COXOXO</username>
71
+ <id>1043200</id>
72
+ </contributor>
73
+ <model>wikitext</model>
74
+ <format>text/x-wiki</format>
75
+ <text xml:space="preserve">アンパサンドとは
76
+ 「…と…」を意味する記号である。</text>
77
+ <sha1>esme5un4ixvpdpy5ri44s948usq3uhd</sha1>
78
+ </revision>
79
+ </page>
80
+ </mediawiki>
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-xml2
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshi0309
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-08-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ~>
17
+ - !ruby/object:Gem::Version
18
+ version: '1.0'
19
+ name: bundler
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '10.0'
33
+ name: rake
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Parses Xml2 files read by other file input plugins.
42
+ email:
43
+ - takumyos@yahoo-corp.jp
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - LICENSE.txt
50
+ - README.md
51
+ - build.gradle
52
+ - config/checkstyle/checkstyle.xml
53
+ - config/checkstyle/default.xml
54
+ - gradle/wrapper/gradle-wrapper.jar
55
+ - gradle/wrapper/gradle-wrapper.properties
56
+ - gradlew
57
+ - gradlew.bat
58
+ - lib/embulk/guess/xml2.rb
59
+ - lib/embulk/parser/xml2.rb
60
+ - src/main/java/org/embulk/parser/xml2/Xml2ParserPlugin.java
61
+ - src/test/java/org/embulk/parser/xml2/TestXml2ParserPlugin.java
62
+ - src/test/resources/sample_01.xml
63
+ - classpath/embulk-parser-xml2-0.1.0.jar
64
+ homepage:
65
+ licenses:
66
+ - MIT
67
+ metadata: {}
68
+ post_install_message:
69
+ rdoc_options: []
70
+ require_paths:
71
+ - lib
72
+ required_ruby_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - '>='
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ requirements: []
83
+ rubyforge_project:
84
+ rubygems_version: 2.1.9
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Xml2 parser plugin for Embulk
88
+ test_files: []