jp-stopword-filter 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jp_stopword_filter-0.1.0/LICENSE +23 -0
- jp_stopword_filter-0.1.0/PKG-INFO +112 -0
- jp_stopword_filter-0.1.0/README.md +102 -0
- jp_stopword_filter-0.1.0/pyproject.toml +33 -0
- jp_stopword_filter-0.1.0/src/JaStopwordFilter.py +182 -0
- jp_stopword_filter-0.1.0/src/__init__.py +0 -0
- jp_stopword_filter-0.1.0/tests/__init__.py +0 -0
- jp_stopword_filter-0.1.0/tests/test_JaStopwordFilter.py +177 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
|
|
2
|
+
MIT License
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2024 Xu Liang
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
7
|
+
a copy of this software and associated documentation files (the
|
|
8
|
+
"Software"), to deal in the Software without restriction, including
|
|
9
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
10
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
11
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
12
|
+
the following conditions:
|
|
13
|
+
|
|
14
|
+
The above copyright notice and this permission notice shall be
|
|
15
|
+
included in all copies or substantial portions of the Software.
|
|
16
|
+
|
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
18
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
19
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
20
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
21
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
22
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
23
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: jp-stopword-filter
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A stopword filter for Japanese
|
|
5
|
+
Author-Email: BrambleXu <liangxu006@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Requires-Dist: numpy>=1.26.4
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# JaStopwordFilter
|
|
12
|
+
|
|
13
|
+
`JaStopwordFilter` is a lightweight Python library designed to filter stopwords from Japanese text based on customizable rules. It provides an efficient way to preprocess Japanese text for natural language processing (NLP) tasks, with support for common stopword removal techniques and user-defined customization.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- **Preloaded Stopwords**: Includes a comprehensive list of Japanese stopwords from SlothLib.
|
|
19
|
+
- **Customizable Rules**:
|
|
20
|
+
- Remove tokens based on **length**.
|
|
21
|
+
- Filter **dates** in common Japanese formats (e.g., `2024年11月`).
|
|
22
|
+
- Exclude **numbers**, **symbols**, **spaces**, and **emojis**.
|
|
23
|
+
- **Custom Wordlist**: Add your own stopwords to the filter.
|
|
24
|
+
- **Flexible Usage**: Use only the rules you need by enabling or disabling them during initialization.
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
Clone the repository and install the dependencies:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
git clone https://github.com/your-username/ja-stopword-filter.git
|
|
33
|
+
cd ja-stopword-filter
|
|
34
|
+
pip install -r requirements.txt
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
### Example Code
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from ja_stopword_filter import JaStopwordFilter
|
|
44
|
+
|
|
45
|
+
# Example token list
|
|
46
|
+
tokens = ["2024年11月", "こんにちは", "123", "!", "😊", "スペース", "短い", "custom"]
|
|
47
|
+
|
|
48
|
+
# Custom wordlist
|
|
49
|
+
custom_wordlist = ["custom", "スペース"]
|
|
50
|
+
|
|
51
|
+
# Initialize the filter
|
|
52
|
+
filter = JaStopwordFilter(
|
|
53
|
+
use_slothlib=True, # Use SlothLib stopwords
|
|
54
|
+
filter_length=1, # Filter tokens with length <= 1
|
|
55
|
+
use_date=True, # Filter Japanese date formats
|
|
56
|
+
use_numbers=True, # Filter numeric tokens
|
|
57
|
+
use_symbols=True, # Filter symbolic tokens
|
|
58
|
+
use_spaces=True, # Filter whitespace-only tokens
|
|
59
|
+
use_emojis=True, # Filter emoji tokens
|
|
60
|
+
custom_wordlist=custom_wordlist # Add custom stopwords
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
# Filter tokens
|
|
64
|
+
filtered_tokens = filter.remove(tokens)
|
|
65
|
+
print(filtered_tokens) # Output: ['こんにちは', '短い']
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
## Parameters
|
|
70
|
+
|
|
71
|
+
The `JaStopwordFilter` class supports the following parameters during initialization:
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
| Parameter | Type | Default | Description |
|
|
75
|
+
|--- |--- |--- |---|
|
|
76
|
+
| `use_slothlib` | `bool` | `True` | Whether to use the SlothLib stopword list. |
|
|
77
|
+
| `filter_length` | `int` | `0` | Remove tokens whose length is less than or equal to this value (`0` disables length filtering). |
|
|
78
|
+
| `use_date` | `bool` | `False` | Remove tokens that match Japanese date formats. |
|
|
79
|
+
| `use_numbers` | `bool` | `False` | Remove numeric tokens. |
|
|
80
|
+
| `use_symbols` | `bool` | `False` | Remove symbolic tokens (e.g., `!`, `@`). |
|
|
81
|
+
| `use_spaces` | `bool` | `False` | Remove tokens that are empty or consist only of spaces. |
|
|
82
|
+
| `use_emojis` | `bool` | `False` | Remove tokens containing emojis. |
|
|
83
|
+
| `custom_wordlist` | `list` | `None` | A list of user-defined stopwords to remove. |
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
## Stopword Sources
|
|
87
|
+
|
|
88
|
+
### SlothLib Stopwords
|
|
89
|
+
If `use_slothlib` is set to `True`, the filter loads stopwords from a `slothlib.txt` file. Ensure this file is in the same directory as the script or adjust the file path in the `get_stopwords` function.
|
|
90
|
+
|
|
91
|
+
### Custom Wordlist
|
|
92
|
+
You can pass a list of custom stopwords using the `custom_wordlist` parameter. These will be merged with the SlothLib stopwords if enabled.
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
## Rules
|
|
96
|
+
|
|
97
|
+
The filter applies the following rules if they are enabled:
|
|
98
|
+
|
|
99
|
+
1. **Length Filtering**: Tokens whose length is less than or equal to `filter_length` are removed (disabled when `filter_length` is `0`).
|
|
100
|
+
2. **Date Filtering**: Matches Japanese date patterns like:
|
|
101
|
+
- `YYYY年MM月`
|
|
102
|
+
- `MM月DD日`
|
|
103
|
+
- `YYYY年MM月DD日`
|
|
104
|
+
3. **Number Filtering**: Removes numeric tokens (`123`, `2024`).
|
|
105
|
+
4. **Symbol Filtering**: Removes punctuation and special symbols.
|
|
106
|
+
5. **Space Filtering**: Removes tokens that are empty or consist only of spaces.
|
|
107
|
+
6. **Emoji Filtering**: Detects and removes tokens containing emojis.
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
## Contributing
|
|
111
|
+
|
|
112
|
+
Contributions are welcome! If you find a bug or have a feature request, feel free to open an issue or submit a pull request.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# JaStopwordFilter
|
|
2
|
+
|
|
3
|
+
`JaStopwordFilter` is a lightweight Python library designed to filter stopwords from Japanese text based on customizable rules. It provides an efficient way to preprocess Japanese text for natural language processing (NLP) tasks, with support for common stopword removal techniques and user-defined customization.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
## Features
|
|
7
|
+
|
|
8
|
+
- **Preloaded Stopwords**: Includes a comprehensive list of Japanese stopwords from SlothLib.
|
|
9
|
+
- **Customizable Rules**:
|
|
10
|
+
- Remove tokens based on **length**.
|
|
11
|
+
- Filter **dates** in common Japanese formats (e.g., `2024年11月`).
|
|
12
|
+
- Exclude **numbers**, **symbols**, **spaces**, and **emojis**.
|
|
13
|
+
- **Custom Wordlist**: Add your own stopwords to the filter.
|
|
14
|
+
- **Flexible Usage**: Use only the rules you need by enabling or disabling them during initialization.
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
Clone the repository and install the dependencies:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
git clone https://github.com/your-username/ja-stopword-filter.git
|
|
23
|
+
cd ja-stopword-filter
|
|
24
|
+
pip install -r requirements.txt
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
### Example Code
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from ja_stopword_filter import JaStopwordFilter
|
|
34
|
+
|
|
35
|
+
# Example token list
|
|
36
|
+
tokens = ["2024年11月", "こんにちは", "123", "!", "😊", "スペース", "短い", "custom"]
|
|
37
|
+
|
|
38
|
+
# Custom wordlist
|
|
39
|
+
custom_wordlist = ["custom", "スペース"]
|
|
40
|
+
|
|
41
|
+
# Initialize the filter
|
|
42
|
+
filter = JaStopwordFilter(
|
|
43
|
+
use_slothlib=True, # Use SlothLib stopwords
|
|
44
|
+
filter_length=1, # Filter tokens with length <= 1
|
|
45
|
+
use_date=True, # Filter Japanese date formats
|
|
46
|
+
use_numbers=True, # Filter numeric tokens
|
|
47
|
+
use_symbols=True, # Filter symbolic tokens
|
|
48
|
+
use_spaces=True, # Filter whitespace-only tokens
|
|
49
|
+
use_emojis=True, # Filter emoji tokens
|
|
50
|
+
custom_wordlist=custom_wordlist # Add custom stopwords
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Filter tokens
|
|
54
|
+
filtered_tokens = filter.remove(tokens)
|
|
55
|
+
print(filtered_tokens) # Output: ['こんにちは', '短い']
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
## Parameters
|
|
60
|
+
|
|
61
|
+
The `JaStopwordFilter` class supports the following parameters during initialization:
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
| Parameter | Type | Default | Description |
|
|
65
|
+
|--- |--- |--- |---|
|
|
66
|
+
| `use_slothlib` | `bool` | `True` | Whether to use the SlothLib stopword list. |
|
|
67
|
+
| `filter_length` | `int` | `0` | Remove tokens whose length is less than or equal to this value (`0` disables length filtering). |
|
|
68
|
+
| `use_date` | `bool` | `False` | Remove tokens that match Japanese date formats. |
|
|
69
|
+
| `use_numbers` | `bool` | `False` | Remove numeric tokens. |
|
|
70
|
+
| `use_symbols` | `bool` | `False` | Remove symbolic tokens (e.g., `!`, `@`). |
|
|
71
|
+
| `use_spaces` | `bool` | `False` | Remove tokens that are empty or consist only of spaces. |
|
|
72
|
+
| `use_emojis` | `bool` | `False` | Remove tokens containing emojis. |
|
|
73
|
+
| `custom_wordlist` | `list` | `None` | A list of user-defined stopwords to remove. |
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
## Stopword Sources
|
|
77
|
+
|
|
78
|
+
### SlothLib Stopwords
|
|
79
|
+
If `use_slothlib` is set to `True`, the filter loads stopwords from a `slothlib.txt` file. Ensure this file is in the same directory as the script or adjust the file path in the `get_stopwords` function.
|
|
80
|
+
|
|
81
|
+
### Custom Wordlist
|
|
82
|
+
You can pass a list of custom stopwords using the `custom_wordlist` parameter. These will be merged with the SlothLib stopwords if enabled.
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
## Rules
|
|
86
|
+
|
|
87
|
+
The filter applies the following rules if they are enabled:
|
|
88
|
+
|
|
89
|
+
1. **Length Filtering**: Tokens whose length is less than or equal to `filter_length` are removed (disabled when `filter_length` is `0`).
|
|
90
|
+
2. **Date Filtering**: Matches Japanese date patterns like:
|
|
91
|
+
- `YYYY年MM月`
|
|
92
|
+
- `MM月DD日`
|
|
93
|
+
- `YYYY年MM月DD日`
|
|
94
|
+
3. **Number Filtering**: Removes numeric tokens (`123`, `2024`).
|
|
95
|
+
4. **Symbol Filtering**: Removes punctuation and special symbols.
|
|
96
|
+
5. **Space Filtering**: Removes tokens that are empty or consist only of spaces.
|
|
97
|
+
6. **Emoji Filtering**: Detects and removes tokens containing emojis.
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
## Contributing
|
|
101
|
+
|
|
102
|
+
Contributions are welcome! If you find a bug or have a feature request, feel free to open an issue or submit a pull request.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "jp-stopword-filter"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A stopword filter for Japanese"
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "BrambleXu", email = "liangxu006@gmail.com" },
|
|
7
|
+
]
|
|
8
|
+
dependencies = [
|
|
9
|
+
"numpy>=1.26.4",
|
|
10
|
+
]
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
|
|
14
|
+
[project.license]
|
|
15
|
+
text = "MIT"
|
|
16
|
+
|
|
17
|
+
[build-system]
|
|
18
|
+
requires = [
|
|
19
|
+
"pdm-backend",
|
|
20
|
+
]
|
|
21
|
+
build-backend = "pdm.backend"
|
|
22
|
+
|
|
23
|
+
[tool.pdm]
|
|
24
|
+
distribution = true
|
|
25
|
+
|
|
26
|
+
[tool.pdm.dev-dependencies]
|
|
27
|
+
dev = [
|
|
28
|
+
"ruff>=0.4.4",
|
|
29
|
+
"pytest>=8.2.0",
|
|
30
|
+
"mypy>=1.10.0",
|
|
31
|
+
"pre-commit>=3.7.1",
|
|
32
|
+
"codespell>=2.2.6",
|
|
33
|
+
]
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import unicodedata
|
|
3
|
+
from typing import List, Optional, Set
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_stopwords(file_path: Optional[str] = None) -> List[str]:
    """
    Reads the SlothLib stopwords from a file and returns them as a list.

    Args:
        file_path (Optional[str]): Path of a UTF-8 text file holding one stopword
            per line. When omitted, the ``slothlib.txt`` located next to this
            module is used if it exists; otherwise the historical
            ``./src/slothlib.txt`` path (relative to the current working
            directory) is tried.

    Returns:
        List[str]: The lines of the file, in file order. Blank lines are kept
        as empty strings, exactly as ``str.splitlines`` returns them.

    Raises:
        OSError: If the stopword file cannot be opened.
    """
    import os

    if file_path is None:
        # Resolving against this module's directory makes the lookup independent
        # of the caller's working directory; the old CWD-relative path remains
        # as a fallback so existing project layouts keep working.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        bundled_path = os.path.join(module_dir, "slothlib.txt")
        file_path = bundled_path if os.path.isfile(bundled_path) else "./src/slothlib.txt"

    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # One stopword per line.
    return content.splitlines()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def convert_to_halfwidth(text: str) -> str:
    """
    Returns ``text`` with full-width characters folded to half-width.

    The work is delegated to Unicode NFKC normalization, so the result is
    whatever ``unicodedata.normalize("NFKC", ...)`` produces for the input.

    Args:
        text (str): The string to normalize.

    Returns:
        str: The NFKC-normalized string.
    """
    normalized = unicodedata.normalize("NFKC", text)
    return normalized
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class JaStopwordFilter:
    """
    A filter class to remove Japanese stopwords and other undesired tokens
    based on customizable rules.

    Every rule is opt-in through the constructor. ``remove`` checks each token
    against the enabled rules and keeps the tokens that no rule rejects.
    """

    # Compiled once for the class instead of being rebuilt on every call.
    # Each pattern is applied with ``fullmatch`` so that only tokens that ARE a
    # date are rejected, not tokens that merely start with one.
    _DATE_PATTERNS = (
        re.compile(r"\d{4}年\d{1,2}月"),  # YYYY年MM月
        re.compile(r"\d{1,2}月\d{1,2}日"),  # MM月DD日
        re.compile(r"\d{4}年\d{1,2}月\d{1,2}日"),  # YYYY年MM月DD日
    )

    # One or more characters from the four ASCII punctuation runs
    # (! to /, : to @, [ to `, { to ~).
    _SYMBOL_PATTERN = re.compile(r"[!-/:-@\[-`{-~]+")

    # Inclusive code point ranges treated as emoji.
    _EMOJI_RANGES = (
        (0x1F600, 0x1F64F),  # Emoticons
        (0x1F300, 0x1F5FF),  # Symbols & Pictographs
        (0x1F680, 0x1F6FF),  # Transport & Map Symbols
        (0x1F1E0, 0x1F1FF),  # Flags
        (0x1F900, 0x1F9FF),  # Supplemental Symbols & Pictographs
        (0x1FA70, 0x1FAFF),  # Symbols & Pictographs Extended-A
    )

    def __init__(
        self,
        convert_full_to_half: bool = True,
        use_slothlib: bool = True,
        filter_length: int = 0,
        use_date: bool = False,
        use_numbers: bool = False,
        use_symbols: bool = False,
        use_spaces: bool = False,
        use_emojis: bool = False,
        custom_wordlist: Optional[List[str]] = None,
    ) -> None:
        """
        Initializes the JaStopwordFilter with the specified filtering rules.

        Args:
            convert_full_to_half (bool): Whether to convert full-width characters to half-width,
                both in the stopword lists and in the tokens passed to ``remove``. Defaults to True.
            use_slothlib (bool): Whether to use the SlothLib stopword list. Defaults to True.
            filter_length (int): Remove tokens with a length less than or equal to this value.
                Defaults to 0 (no filtering).
            use_date (bool): Whether to remove tokens that match Japanese date patterns. Defaults to False.
            use_numbers (bool): Whether to remove numeric tokens. Defaults to False.
            use_symbols (bool): Whether to remove tokens consisting of symbols. Defaults to False.
            use_spaces (bool): Whether to remove tokens that are empty or contain only spaces. Defaults to False.
            use_emojis (bool): Whether to remove tokens containing emojis. Defaults to False.
            custom_wordlist (Optional[List[str]]): A list of user-defined stopwords to remove. Defaults to None.
        """
        self.stopwords: Set[str] = set()
        self.filter_length = filter_length
        self.use_date = use_date
        self.use_numbers = use_numbers
        self.use_symbols = use_symbols
        self.use_spaces = use_spaces
        self.use_emojis = use_emojis
        self.convert_full_to_half = convert_full_to_half

        # Stopwords are stored in the same normalized form that ``remove``
        # gives the tokens, so the membership test compares like with like.
        if use_slothlib:
            self.stopwords.update(self._normalize(word) for word in get_stopwords())

        if custom_wordlist:
            self.stopwords.update(self._normalize(word) for word in custom_wordlist)

    def _normalize(self, text: str) -> str:
        """Returns ``text`` in half-width form when conversion is enabled, otherwise unchanged."""
        return convert_to_halfwidth(text) if self.convert_full_to_half else text

    def remove(self, tokens: List[str]) -> List[str]:
        """
        Removes tokens based on the filtering rules.

        When ``convert_full_to_half`` is enabled the tokens are converted first,
        so the returned tokens are the converted forms, not the original strings.

        Args:
            tokens (List[str]): A list of input tokens to filter.

        Returns:
            List[str]: A new list with the tokens that no enabled rule rejected,
            in their original order.
        """
        return [
            token
            for token in (self._normalize(raw) for raw in tokens)
            if not self._should_remove(token)
        ]

    def _should_remove(self, token: str) -> bool:
        """Returns True if the stopword set or any enabled rule rejects ``token``."""
        if token in self.stopwords:
            return True
        if self.filter_length > 0 and len(token) <= self.filter_length:
            return True
        if self.use_date and self._is_date(token):
            return True
        if self.use_numbers and self._is_number(token):
            return True
        if self.use_symbols and self._is_symbol(token):
            return True
        if self.use_spaces and token.strip() == "":
            return True
        if self.use_emojis and self._is_emoji(token):
            return True
        return False

    def _is_date(self, token: str) -> bool:
        """
        Checks if a token is a common Japanese date expression.

        The whole token must match one of ``YYYY年MM月``, ``MM月DD日`` or
        ``YYYY年MM月DD日``; a token that only begins with a date
        (for example ``2024年11月号``) is not treated as a date.

        Args:
            token (str): The token to check.

        Returns:
            bool: True if the token matches a date pattern, otherwise False.
        """
        return any(pattern.fullmatch(token) for pattern in self._DATE_PATTERNS)

    def _is_number(self, token: str) -> bool:
        """
        Checks if a token is numeric.

        Args:
            token (str): The token to check.

        Returns:
            bool: True if ``str.isdigit`` accepts the token, otherwise False.
        """
        return token.isdigit()

    def _is_symbol(self, token: str) -> bool:
        """
        Checks if a token consists solely of ASCII punctuation symbols.

        Tokens of any length are accepted, so ``"!"`` and ``"!?"`` both count;
        the empty string does not.

        Args:
            token (str): The token to check.

        Returns:
            bool: True if every character of the token is an ASCII symbol, otherwise False.
        """
        return self._SYMBOL_PATTERN.fullmatch(token) is not None

    def _is_emoji(self, token: str) -> bool:
        """
        Checks if a token contains emojis.

        Args:
            token (str): The token to check.

        Returns:
            bool: True if at least one character falls in a range listed in
            ``_EMOJI_RANGES``, otherwise False.
        """
        return any(
            low <= ord(char) <= high
            for char in token
            for low, high in self._EMOJI_RANGES
        )
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from src.JaStopwordFilter import JaStopwordFilter, convert_to_halfwidth, get_stopwords
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Full-width literals are spelled with escape sequences so that tooling which
# applies Unicode compatibility normalization to this file cannot silently turn
# them into their half-width twins (which would make the paired
# full-width / half-width assertions below vacuous or contradictory).
FULLWIDTH_DATE = "\uff12\uff10\uff12\uff14年\uff11\uff11月"  # full-width form of 2024年11月
FULLWIDTH_NUMBER = "\uff11\uff12\uff13"  # full-width form of 123
FULLWIDTH_LETTERS = "\uff21\uff22\uff23"  # full-width form of ABC
FULLWIDTH_SYMBOLS = "\uff01\uff20\uff03"  # full-width form of !@#
FULLWIDTH_SYMBOL = "\uff01"  # full-width form of !
FULLWIDTH_SPACE = "\u3000"  # ideographic (full-width) space


@pytest.fixture
def tokens():
    """
    A sample token list for testing.
    """
    return [
        FULLWIDTH_DATE,  # Full-width date
        "2024年11月",  # Half-width date
        FULLWIDTH_NUMBER,  # Full-width number
        "123",  # Half-width number
        FULLWIDTH_SYMBOL,  # Full-width symbol
        "!",  # Half-width symbol
        "😊",  # Emoji
        "短",  # Short token
        "長い単語",  # Long token
        "custom",  # Custom word
    ]


def test_get_stopwords():
    """
    Test if get_stopwords correctly loads stopwords from a file.
    """
    stopwords = get_stopwords()
    assert isinstance(stopwords, list), "Stopwords should be returned as a list."
    assert len(stopwords) > 0, "Stopwords list should not be empty."


def test_convert_to_halfwidth():
    """
    Test if convert_to_halfwidth correctly converts full-width characters to half-width.
    """
    assert convert_to_halfwidth(FULLWIDTH_NUMBER) == "123", "Full-width numbers should convert to half-width."
    assert convert_to_halfwidth(FULLWIDTH_LETTERS) == "ABC", "Full-width letters should convert to half-width."
    assert convert_to_halfwidth(FULLWIDTH_SYMBOLS) == "!@#", "Full-width symbols should convert to half-width."


def test_init_with_full_to_half(tokens):
    """
    Test if the JaStopwordFilter correctly handles full-to-half conversion during initialization.
    """
    stopword_filter = JaStopwordFilter(convert_full_to_half=True)
    assert stopword_filter.convert_full_to_half, "convert_full_to_half should be True when enabled."


def test_remove_with_full_to_half(tokens):
    """
    Test if tokens are correctly filtered after converting full-width to half-width.
    """
    custom_wordlist = [FULLWIDTH_NUMBER, "custom"]
    stopword_filter = JaStopwordFilter(convert_full_to_half=True, custom_wordlist=custom_wordlist)
    filtered = stopword_filter.remove(tokens)
    assert FULLWIDTH_NUMBER not in filtered, "Full-width '123' should be converted and removed."
    assert "custom" not in filtered, "'custom' should be removed."
    assert FULLWIDTH_DATE not in filtered, "Full-width date should be converted and removed."


def test_remove_without_full_to_half(tokens):
    """
    Test if tokens are filtered without converting full-width to half-width.
    """
    custom_wordlist = [FULLWIDTH_NUMBER, "custom"]
    stopword_filter = JaStopwordFilter(convert_full_to_half=False, custom_wordlist=custom_wordlist)
    filtered = stopword_filter.remove(tokens)
    assert FULLWIDTH_NUMBER not in filtered, "Full-width '123' should be removed."
    assert "123" in filtered, "Half-width '123' should not be removed."
    assert "custom" not in filtered, "'custom' should be removed."


def test_filter_length(tokens):
    """
    Test if tokens with length less than or equal to filter_length are correctly removed.
    """
    stopword_filter = JaStopwordFilter(filter_length=2, convert_full_to_half=True)
    filtered = stopword_filter.remove(tokens)
    assert "短" not in filtered, "'短' should be removed because its length is <= 2."
    assert "長い単語" in filtered, "'長い単語' should remain because its length is > 2."


def test_remove_with_date(tokens):
    """
    Test if tokens matching date patterns are correctly removed.
    """
    stopword_filter = JaStopwordFilter(use_date=True, convert_full_to_half=True)
    filtered = stopword_filter.remove(tokens)
    assert FULLWIDTH_DATE not in filtered, "Full-width date should be converted and removed."
    assert "2024年11月" not in filtered, "Half-width date should be removed."


def test_remove_with_numbers(tokens):
    """
    Test if numeric tokens are correctly removed.
    """
    stopword_filter = JaStopwordFilter(use_numbers=True, convert_full_to_half=True)
    filtered = stopword_filter.remove(tokens)
    assert FULLWIDTH_NUMBER not in filtered, "Full-width number should be converted and removed."
    assert "123" not in filtered, "Half-width number should be removed."


def test_remove_with_symbols(tokens):
    """
    Test if symbolic tokens are correctly removed.
    """
    stopword_filter = JaStopwordFilter(use_symbols=True, convert_full_to_half=True)
    filtered = stopword_filter.remove(tokens)
    assert FULLWIDTH_SYMBOL not in filtered, "Full-width symbol should be converted and removed."
    assert "!" not in filtered, "Half-width symbol should be removed."


def test_remove_with_spaces():
    """
    Test if empty or whitespace-only tokens are correctly removed.
    """
    tokens_with_spaces = [" ", FULLWIDTH_SPACE, "custom"]
    stopword_filter = JaStopwordFilter(use_spaces=True, convert_full_to_half=True)
    filtered = stopword_filter.remove(tokens_with_spaces)
    assert " " not in filtered, "Half-width space should be removed."
    assert FULLWIDTH_SPACE not in filtered, "Full-width space should be removed."
    assert "custom" in filtered, "'custom' should remain."


def test_remove_with_emojis(tokens):
    """
    Test if tokens containing emojis are correctly removed.
    """
    stopword_filter = JaStopwordFilter(use_emojis=True, convert_full_to_half=True)
    filtered = stopword_filter.remove(tokens)
    assert "😊" not in filtered, "Emoji '😊' should be removed."


def test_combined_rules(tokens):
    """
    Test if the filter works correctly with combined rules and full-to-half conversion.
    """
    custom_wordlist = [FULLWIDTH_NUMBER, "custom"]
    stopword_filter = JaStopwordFilter(
        filter_length=2,
        use_date=True,
        use_numbers=True,
        use_symbols=True,
        use_emojis=True,
        convert_full_to_half=True,
        custom_wordlist=custom_wordlist,
    )
    filtered = stopword_filter.remove(tokens)
    assert FULLWIDTH_DATE not in filtered, "Full-width date should be converted and removed."
    assert "123" not in filtered, "Half-width number should be removed."
    assert FULLWIDTH_NUMBER not in filtered, "Full-width number should be converted and removed."
    assert FULLWIDTH_SYMBOL not in filtered, "Full-width symbol should be converted and removed."
    assert "😊" not in filtered, "Emoji '😊' should be removed."
    assert "短" not in filtered, "Tokens with length <= 2 should be removed."
    assert "長い単語" in filtered, "'長い単語' should remain in the filtered list."


def test_no_rules(tokens):
    """
    Test if all tokens are retained when no rules are enabled.
    """
    stopword_filter = JaStopwordFilter(convert_full_to_half=False, use_slothlib=False)
    filtered = stopword_filter.remove(tokens)
    # Compared against the fixture itself rather than a second hand-copied
    # literal list, so the expectation cannot drift from the input.
    assert filtered == tokens, "All tokens should remain when no rules are enabled."
|